diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 4574e04bf3..1e0b4d2c20 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -52,7 +52,7 @@ body: attributes: label: Describe the bug description: A clear and concise description of what problem you are running into. - placeholder: "Attempting to compile a program via `cuda.core.experimental.Program.compile` throws a `ValueError`." + placeholder: "Attempting to compile a program via `cuda.core.Program.compile` throws a `ValueError`." validations: required: true @@ -62,7 +62,7 @@ body: label: How to Reproduce description: Steps used to reproduce the bug. placeholder: | - 0. Construct a `cuda.core.experimental.Program` instance + 0. Construct a `cuda.core.Program` instance 1. Call the `.compile(...)` method of the instance 2. The call throws a `ValueError` with the following: ``` @@ -76,7 +76,7 @@ body: attributes: label: Expected behavior description: A clear and concise description of what you expected to happen. - placeholder: "Using `cuda.core.experimental.Program.compile(...)` should run successfully and not throw a `ValueError`" + placeholder: "Using `cuda.core.Program.compile(...)` should run successfully and not throw a `ValueError`" validations: required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index cbbc03c492..6d1504c4c4 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -36,7 +36,7 @@ body: attributes: label: Is your feature request related to a problem? Please describe. description: A clear and concise description of what the problem is, e.g., "I would like to be able to..." - placeholder: I would like to be able to use the equivalent of `cuda.core.experimental.Program.compile(...)` to compile my code to PTX. 
+ placeholder: I would like to be able to use the equivalent of `cuda.core.Program.compile(...)` to compile my code to PTX. validations: required: true @@ -46,7 +46,7 @@ body: label: Describe the solution you'd like description: A clear and concise description of what you want to happen. placeholder: | - Support a `ptx` target_type in the `cuda.core.experimental.Program.compile(...)` function. + Support a `ptx` target_type in the `cuda.core.Program.compile(...)` function. validations: required: true @@ -57,7 +57,7 @@ body: description: If applicable, please add a clear and concise description of any alternative solutions or features you've considered. - placeholder: The alternatives to using `cuda.core.experimental.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable. + placeholder: The alternatives to using `cuda.core.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable. validations: required: false diff --git a/.spdx-ignore b/.spdx-ignore index c7177752e1..a9d54c6c15 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -10,6 +10,6 @@ requirements*.txt cuda_bindings/examples/* # Vendored -cuda_core/cuda/core/experimental/include/dlpack.h +cuda_core/cuda/core/include/dlpack.h qa/ctk-next.drawio.svg diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 359b98d6a7..14ed53c308 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -12,8 +12,8 @@ In particular, each wheel contains a CUDA-specific build of the `cuda.core` library and the associated bindings. This script merges these directories into a single wheel -that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12` -and `cuda/core/experimental/cu13`. 
At runtime, the code in `cuda/core/experimental/__init__.py` +that supports both CUDA versions, i.e., containing both `cuda/core/cu12` +and `cuda/core/cu13`. At runtime, the code in `cuda/core/__init__.py` is used to import the appropriate CUDA-specific bindings. This script is based on the one in NVIDIA/CCCL. @@ -94,27 +94,38 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] - # now copy the version-specific directory from other wheels - # into the appropriate place in the base wheel + # Copy version-specific binaries from each wheel into versioned subdirectories + # Note: Python modules stay in cuda/core/, only binaries go into cu12/cu13/ + base_dir = Path("cuda") / "core" + for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] - base_dir = Path("cuda") / "core" / "experimental" - # Copy from other wheels - print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr) - shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") - - # Overwrite the __init__.py in versioned dirs - os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0) - - # The base dir should only contain __init__.py, the include dir, and the versioned dirs - files_to_remove = os.scandir(base_wheel / base_dir) - for f in files_to_remove: - f_abspath = f.path - if f.name not in ("__init__.py", "cu12", "cu13", "include"): - if f.is_dir(): - shutil.rmtree(f_abspath) - else: - os.remove(f_abspath) + versioned_dir = base_wheel / base_dir / f"cu{cuda_version}" + + # Create versioned directory + versioned_dir.mkdir(parents=True, exist_ok=True) + + # Copy only version-specific binaries (.so, .pyd, .dll files) from the source wheel + # Python modules (.py, .pyx, .pxd) remain in cuda/core/ + # Exclude versioned directories (cu12/, cu13/) to avoid recursion + source_dir = wheel_dir / base_dir + 
for item in source_dir.rglob("*"): + if item.is_dir(): + continue + + # Skip files in versioned directories to avoid recursion + rel_path = item.relative_to(source_dir) + if any(part in ("cu12", "cu13") for part in rel_path.parts): + continue + + # Only copy binary files, not Python source files + if item.suffix in (".so", ".pyd", ".dll"): + dest_item = versioned_dir / rel_path + dest_item.parent.mkdir(parents=True, exist_ok=True) + shutil.copy2(item, dest_item) + + # Create empty __init__.py in versioned dirs + (versioned_dir / "__init__.py").touch() # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index e38f5676df..aef506762f 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -66,7 +66,7 @@ def _build_cuda_core(): # It seems setuptools' wildcard support has problems for namespace packages, # so we explicitly spell out all Extension instances. - root_module = "cuda.core.experimental" + root_module = "cuda.core" root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) @@ -84,11 +84,16 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH + # Add local include directory for cuda/core/include + local_include_dirs = ["cuda/core"] + cuda_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths()) + all_include_dirs = local_include_dirs + cuda_include_dirs + ext_modules = tuple( Extension( - f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), + f"cuda.core.{mod.replace(os.path.sep, '.')}", + sources=[f"cuda/core/{mod}.pyx"], + include_dirs=all_include_dirs, language="c++", ) for mod in module_names diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 96a80d1f3e..d074be02d8 100644 
--- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -3,3 +3,59 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.core._version import __version__ + +try: + from cuda import bindings +except ImportError: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None +else: + cuda_major, cuda_minor = bindings.__version__.split(".")[:2] + if cuda_major not in ("12", "13"): + raise ImportError("cuda.bindings 12.x or 13.x must be installed") + +import importlib + +subdir = f"cu{cuda_major}" +try: + versioned_mod = importlib.import_module(f".{subdir}", __package__) + # Import all symbols from the module + globals().update(versioned_mod.__dict__) +except ImportError: + # This is not a wheel build, but a conda or local build, do nothing + pass +else: + del versioned_mod +finally: + del bindings, importlib, subdir, cuda_major, cuda_minor + +from cuda.core import utils # noqa: E402 +from cuda.core._device import Device # noqa: E402 +from cuda.core._event import Event, EventOptions # noqa: E402 +from cuda.core._graph import ( # noqa: E402 + Graph, + GraphBuilder, + GraphCompleteOptions, + GraphDebugPrintOptions, +) +from cuda.core._launch_config import LaunchConfig # noqa: E402 +from cuda.core._launcher import launch # noqa: E402 +from cuda.core._layout import StridedLayout # noqa: E402 +from cuda.core._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core._memory import ( # noqa: E402 + Buffer, + DeviceMemoryResource, + DeviceMemoryResourceOptions, + GraphMemoryResource, + LegacyPinnedMemoryResource, + MemoryResource, + VirtualMemoryResource, + VirtualMemoryResourceOptions, +) +from cuda.core._module import Kernel, ObjectCode # noqa: E402 +from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._stream import Stream, StreamOptions # noqa: E402 +from cuda.core._system import System # noqa: E402 + +system = System() +__import__("sys").modules[__spec__.name + ".system"] = system +del System diff 
--git a/cuda_core/cuda/core/experimental/__init__.pxd b/cuda_core/cuda/core/__init__.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/__init__.pxd rename to cuda_core/cuda/core/__init__.pxd diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/_context.pyx similarity index 94% rename from cuda_core/cuda/core/experimental/_context.pyx rename to cuda_core/cuda/core/_context.pyx index f9858c1710..c1c28b3389 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -4,7 +4,7 @@ from dataclasses import dataclass -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver @dataclass diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/_device.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_device.pyx rename to cuda_core/cuda/core/_device.pyx index cd802943a5..7f1582e179 100644 --- a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -6,27 +6,27 @@ cimport cpython from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import threading from typing import Optional, TYPE_CHECKING, Union -from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._context import Context, ContextOptions +from cuda.core._event import Event, EventOptions +from cuda.core._graph import GraphBuilder +from cuda.core._stream import IsStreamT, Stream, StreamOptions +from 
cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( ComputeCapability, CUDAError, driver, handle_return, runtime, ) -from cuda.core.experimental._stream cimport default_stream +from cuda.core._stream cimport default_stream if TYPE_CHECKING: - from cuda.core.experimental._memory import Buffer, MemoryResource + from cuda.core._memory import Buffer, MemoryResource # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, # but it seems it is very convenient to expose them for testing purposes... @@ -1154,17 +1154,17 @@ class Device: ) ) if attr == 1: - from cuda.core.experimental._memory import DeviceMemoryResource + from cuda.core._memory import DeviceMemoryResource self._memory_resource = DeviceMemoryResource(self._id) else: - from cuda.core.experimental._memory import _SynchronousMemoryResource + from cuda.core._memory import _SynchronousMemoryResource self._memory_resource = _SynchronousMemoryResource(self._id) return self._memory_resource @memory_resource.setter def memory_resource(self, mr): - from cuda.core.experimental._memory import MemoryResource + from cuda.core._memory import MemoryResource assert_type(mr, MemoryResource) self._memory_resource = mr @@ -1223,7 +1223,7 @@ class Device: Acts as an entry point of this object. Users always start a code by calling this method, e.g. - >>> from cuda.core.experimental import Device + >>> from cuda.core import Device >>> dev0 = Device(0) >>> dev0.set_current() >>> # ... do work on device 0 ... 
diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd b/cuda_core/cuda/core/_dlpack.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_dlpack.pxd rename to cuda_core/cuda/core/_dlpack.pxd diff --git a/cuda_core/cuda/core/experimental/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_dlpack.pyx rename to cuda_core/cuda/core/_dlpack.pyx diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/_event.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_event.pxd rename to cuda_core/cuda/core/_event.pxd diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/_event.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_event.pyx rename to cuda_core/cuda/core/_event.pyx index 149c92b8e1..e97fdfbab4 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -8,7 +8,7 @@ cimport cpython from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN ) @@ -18,8 +18,8 @@ from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional -from cuda.core.experimental._context import Context -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._context import Context +from cuda.core._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, driver, diff --git a/cuda_core/cuda/core/experimental/_graph.py b/cuda_core/cuda/core/_graph.py similarity index 99% rename from cuda_core/cuda/core/experimental/_graph.py rename to cuda_core/cuda/core/_graph.py index a82bd70f55..df51126bb0 100644 --- a/cuda_core/cuda/core/experimental/_graph.py +++ b/cuda_core/cuda/core/_graph.py @@ -9,8 +9,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: 
- from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import ( + from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import ( driver, get_binding_version, handle_return, diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx similarity index 99% rename from cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx rename to cuda_core/cuda/core/_kernel_arg_handler.pyx index 4cac74a25f..6374605262 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -15,8 +15,8 @@ import ctypes import numpy -from cuda.core.experimental._memory import Buffer -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._memory import Buffer +from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver diff --git a/cuda_core/cuda/core/experimental/_launch_config.pxd b/cuda_core/cuda/core/_launch_config.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_launch_config.pxd rename to cuda_core/cuda/core/_launch_config.pxd diff --git a/cuda_core/cuda/core/experimental/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_launch_config.pyx rename to cuda_core/cuda/core/_launch_config.pyx index 7d6a1ab2b9..00c71ad903 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.pyx +++ b/cuda_core/cuda/core/_launch_config.pyx @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import ( CUDAError, cast_to_3_tuple, driver, diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx similarity index 88% rename from cuda_core/cuda/core/experimental/_launcher.pyx 
rename to cuda_core/cuda/core/_launcher.pyx index 2cba15cbf4..09900a668c 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -1,15 +1,15 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._launch_config cimport LaunchConfig, _to_native_launch_config -from cuda.core.experimental._stream cimport Stream_accept +from cuda.core._launch_config cimport LaunchConfig, _to_native_launch_config +from cuda.core._stream cimport Stream_accept -from cuda.core.experimental._kernel_arg_handler import ParamHolder -from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._kernel_arg_handler import ParamHolder +from cuda.core._module import Kernel +from cuda.core._stream import Stream +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( _reduce_3_tuple, check_or_create_options, driver, diff --git a/cuda_core/cuda/core/experimental/_layout.pxd b/cuda_core/cuda/core/_layout.pxd similarity index 99% rename from cuda_core/cuda/core/experimental/_layout.pxd rename to cuda_core/cuda/core/_layout.pxd index 301cdaaa65..d2306117e0 100644 --- a/cuda_core/cuda/core/experimental/_layout.pxd +++ b/cuda_core/cuda/core/_layout.pxd @@ -18,7 +18,7 @@ ctypedef uint32_t property_mask_t ctypedef vector.vector[stride_t] extents_strides_t ctypedef vector.vector[axis_t] axis_vec_t -from cuda.core.experimental._utils cimport cuda_utils +from cuda.core._utils cimport cuda_utils ctypedef fused integer_t: diff --git a/cuda_core/cuda/core/experimental/_layout.pyx b/cuda_core/cuda/core/_layout.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_layout.pyx rename to 
cuda_core/cuda/core/_layout.pyx diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/_linker.py similarity index 98% rename from cuda_core/cuda/core/experimental/_linker.py rename to cuda_core/cuda/core/_linker.py index 5c54a88c8c..2d2e47bb3e 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -15,10 +15,10 @@ if TYPE_CHECKING: import cuda.bindings -from cuda.core.experimental._device import Device -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence +from cuda.core._device import Device +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence # TODO: revisit this treatment for py313t builds _driver = None # populated if nvJitLink cannot be used @@ -355,7 +355,7 @@ def _exception_manager(self): class Linker: """Represent a linking machinery to link one or multiple object codes into - :obj:`~cuda.core.experimental._module.ObjectCode` with the specified options. + :obj:`~cuda.core._module.ObjectCode` with the specified options. This object provides a unified interface to multiple underlying linker libraries (such as nvJitLink or cuLink* from CUDA driver). 
diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/_memory/__init__.py similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/__init__.py rename to cuda_core/cuda/core/_memory/__init__.py diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd similarity index 92% rename from cuda_core/cuda/core/experimental/_memory/_buffer.pxd rename to cuda_core/cuda/core/_memory/_buffer.pxd index b581dcd293..730e448f63 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,7 +4,7 @@ from libc.stdint cimport uintptr_t -from cuda.core.experimental._stream cimport Stream +from cuda.core._stream cimport Stream cdef struct _MemAttrs: diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/_buffer.pyx rename to cuda_core/cuda/core/_memory/_buffer.pyx index b26471ed0e..45c30aa933 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -8,18 +8,18 @@ cimport cython from libc.stdint cimport uintptr_t, int64_t, uint64_t from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource -from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource +from cuda.core._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer +from cuda.core._memory cimport _ipc +from cuda.core._stream cimport Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import abc from typing import 
TypeVar, Union -from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._utils.cuda_utils import driver -from cuda.core.experimental._device import Device +from cuda.core._dlpack import DLDeviceType, make_py_capsule +from cuda.core._utils.cuda_utils import driver +from cuda.core._device import Device __all__ = ['Buffer', 'MemoryResource'] diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd similarity index 82% rename from cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_device_memory_resource.pxd index 823a270b27..4c7482a5fe 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core._memory._buffer cimport MemoryResource +from cuda.core._memory._ipc cimport IPCDataForMR cdef class DeviceMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_device_memory_resource.pyx index ac18079a62..bc286dd62e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -10,11 +10,11 @@ from libc.stdlib cimport malloc, free from libc.string cimport memset from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._memory cimport _ipc -from 
cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR -from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._memory cimport _ipc +from cuda.core._memory._ipc cimport IPCAllocationHandle, IPCDataForMR +from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) @@ -25,10 +25,10 @@ import platform # no-cython-lint import uuid import weakref -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT from .._device import Device __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions'] diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd similarity index 77% rename from cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_graph_memory_resource.pxd index f9c7798e76..2f6c35d72e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._buffer cimport MemoryResource +from cuda.core._memory._buffer cimport MemoryResource cdef class cyGraphMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index 
c65354b612..bda075c201 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -7,15 +7,15 @@ from __future__ import annotations from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from functools import cache from typing import TYPE_CHECKING if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT __all__ = ['GraphMemoryResource'] diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/_memory/_ipc.pxd similarity index 92% rename from cuda_core/cuda/core/experimental/_memory/_ipc.pxd rename to cuda_core/cuda/core/_memory/_ipc.pxd index 60d96a3b33..c8d4a8a9ae 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/_memory/_ipc.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource +from cuda.core._memory._buffer cimport Buffer +from cuda.core._memory._device_memory_resource cimport DeviceMemoryResource # Holds DeviceMemoryResource objects imported by this process. 
This enables diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_ipc.pyx rename to cuda_core/cuda/core/_memory/_ipc.pyx index c9931855cf..81ae52f6b0 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -7,10 +7,10 @@ from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._stream cimport default_stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._memory._buffer cimport Buffer +from cuda.core._stream cimport default_stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils import check_multiprocessing_start_method import multiprocessing import os diff --git a/cuda_core/cuda/core/experimental/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py similarity index 89% rename from cuda_core/cuda/core/experimental/_memory/_legacy.py rename to cuda_core/cuda/core/_memory/_legacy.py index 09ea0e15d2..317494ea9e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -6,16 +6,16 @@ from typing import TYPE_CHECKING -from cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, ) -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._utils.cuda_utils import ( driver, ) if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT 
__all__ = ["LegacyPinnedMemoryResource", "_SynchronousMemoryResource"] @@ -43,7 +43,7 @@ def allocate(self, size, stream=None) -> Buffer: The allocated buffer object, which is accessible on both host and device. """ if stream is None: - from cuda.core.experimental._stream import default_stream + from cuda.core._stream import default_stream stream = default_stream() err, ptr = driver.cuMemAllocHost(size) @@ -93,7 +93,7 @@ def __init__(self, device_id): def allocate(self, size, stream=None) -> Buffer: if stream is None: - from cuda.core.experimental._stream import default_stream + from cuda.core._stream import default_stream stream = default_stream() err, ptr = driver.cuMemAlloc(size) diff --git a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py similarity index 98% rename from cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py rename to cuda_core/cuda/core/_memory/_virtual_memory_resource.py index 2806e2d0d5..43da00744a 100644 --- a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py @@ -7,20 +7,20 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Iterable, Literal, Union -from cuda.core.experimental._device import Device -from cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._utils.cuda_utils import ( Transaction, check_or_create_options, driver, get_binding_version, ) -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, ) if TYPE_CHECKING: - from cuda.core.experimental._stream import Stream + from cuda.core._stream import Stream __all__ = ["VirtualMemoryResourceOptions", 
"VirtualMemoryResource"] diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx similarity index 99% rename from cuda_core/cuda/core/experimental/_memoryview.pyx rename to cuda_core/cuda/core/_memoryview.pyx index dc972d912a..6c329dc4ff 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -4,8 +4,8 @@ from ._dlpack cimport * from libc.stdint cimport intptr_t -from cuda.core.experimental._layout cimport StridedLayout -from cuda.core.experimental._stream import Stream +from cuda.core._layout cimport StridedLayout +from cuda.core._stream import Stream import functools import warnings @@ -13,10 +13,10 @@ from typing import Optional import numpy -from cuda.core.experimental._utils.cuda_utils import handle_return, driver +from cuda.core._utils.cuda_utils import handle_return, driver -from cuda.core.experimental._memory import Buffer +from cuda.core._memory import Buffer # TODO(leofang): support NumPy structured dtypes diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/_module.py similarity index 98% rename from cuda_core/cuda/core/experimental/_module.py rename to cuda_core/cuda/core/_module.py index 9af722465b..fbea314406 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -7,15 +7,15 @@ from typing import Union from warnings import warn -from cuda.core.experimental._device import Device -from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config -from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.clear_error_support import ( +from cuda.core._device import Device +from cuda.core._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core._stream import Stream +from cuda.core._utils.clear_error_support import ( assert_type, assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from 
cuda.core.experimental._utils.cuda_utils import driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition _backend = { "old": { @@ -453,7 +453,7 @@ class ObjectCode: This class has no default constructor. If you already have a cubin that you would like to load, use the :meth:`from_cubin` alternative constructor. Constructing directly from all other possible code types should be avoided in favor of compilation through - :class:`~cuda.core.experimental.Program` + :class:`~cuda.core.Program` Note ---- diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/_program.py similarity index 98% rename from cuda_core/cuda/core/experimental/_program.py rename to cuda_core/cuda/core/_program.py index cdef7c3be6..6344991e30 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -13,11 +13,11 @@ if TYPE_CHECKING: import cuda.bindings -from cuda.core.experimental._device import Device -from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( _handle_boolean_option, check_or_create_options, driver, diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/_stream.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_stream.pxd rename to cuda_core/cuda/core/_stream.pxd diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/_stream.pyx similarity index 97% rename from 
cuda_core/cuda/core/experimental/_stream.pyx rename to cuda_core/cuda/core/_stream.pyx index 87ec4a691a..b724f9aee3 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -9,8 +9,8 @@ from libc.stdlib cimport strtol, getenv from cuda.bindings cimport cydriver -from cuda.core.experimental._event cimport Event as cyEvent -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._event cimport Event as cyEvent +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, CU_CONTEXT_INVALID, get_device_from_ctx, @@ -24,11 +24,11 @@ from typing import TYPE_CHECKING, Optional, Protocol, Union if TYPE_CHECKING: import cuda.bindings - from cuda.core.experimental._device import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._utils.cuda_utils import ( + from cuda.core._device import Device +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._graph import GraphBuilder +from cuda.core._utils.cuda_utils import ( driver, ) @@ -311,7 +311,7 @@ cdef class Stream: context is set current after a stream is created. 
""" - from cuda.core.experimental._device import Device # avoid circular import + from cuda.core._device import Device # avoid circular import self._get_device_and_context() return Device((self._device_id)) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/_system.py similarity index 92% rename from cuda_core/cuda/core/experimental/_system.py rename to cuda_core/cuda/core/_system.py index cbbc1a83cb..a8338114b2 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/_system.py @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils.cuda_utils import driver, handle_return, runtime +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import driver, handle_return, runtime class System: diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.pxd b/cuda_core/cuda/core/_utils/__init__.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/__init__.pxd rename to cuda_core/cuda/core/_utils/__init__.pxd diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.py b/cuda_core/cuda/core/_utils/__init__.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/__init__.py rename to cuda_core/cuda/core/_utils/__init__.py diff --git a/cuda_core/cuda/core/experimental/_utils/clear_error_support.py b/cuda_core/cuda/core/_utils/clear_error_support.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/clear_error_support.py rename to cuda_core/cuda/core/_utils/clear_error_support.py diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd rename to cuda_core/cuda/core/_utils/cuda_utils.pxd diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx 
similarity index 98% rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx rename to cuda_core/cuda/core/_utils/cuda_utils.pyx index 4489871747..0c3f6521a4 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -20,8 +20,8 @@ except ImportError: from cuda import cudart as runtime from cuda import nvrtc -from cuda.core.experimental._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS -from cuda.core.experimental._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS +from cuda.core._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS +from cuda.core._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS class CUDAError(Exception): diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/_utils/driver_cu_result_explanations.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py rename to cuda_core/cuda/core/_utils/driver_cu_result_explanations.py diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py rename to cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 826ea70b97..f937a3795a 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,43 +2,59 @@ # # SPDX-License-Identifier: Apache-2.0 -try: - from cuda import bindings -except ImportError: - raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None -else: - cuda_major, cuda_minor = bindings.__version__.split(".")[:2] - if cuda_major not in ("12", 
"13"): - raise ImportError("cuda.bindings 12.x or 13.x must be installed") - -import importlib - -subdir = f"cu{cuda_major}" -try: - versioned_mod = importlib.import_module(f".{subdir}", __package__) - # Import all symbols from the module - globals().update(versioned_mod.__dict__) -except ImportError: - # This is not a wheel build, but a conda or local build, do nothing - pass -else: - del versioned_mod -finally: - del bindings, importlib, subdir, cuda_major, cuda_minor - -from cuda.core.experimental import utils # noqa: E402 -from cuda.core.experimental._device import Device # noqa: E402 -from cuda.core.experimental._event import Event, EventOptions # noqa: E402 -from cuda.core.experimental._graph import ( # noqa: E402 +""" +Backward compatibility stubs for cuda.core.experimental namespace. + +This module provides forwarding stubs that import from the new cuda.core.* +locations and emit deprecation warnings. Users should migrate to importing +directly from cuda.core instead of cuda.core.experimental. + +The experimental namespace will be removed in a future release. +""" + +import warnings + + +def _warn_deprecated(): + """Emit a deprecation warning for using the experimental namespace. + + Note: This warning is only when the experimental module is first imported. + Subsequent accesses to attributes (like utils, Device, etc.) do not trigger + additional warnings since they are already set in the module namespace. + Only accessing submodules via __getattr__ (e.g., _device, _utils) will trigger + additional warnings. + """ + warnings.warn( + "The cuda.core.experimental namespace is deprecated. " + "Please import directly from cuda.core instead. " + "For example, use 'from cuda.core import Device' instead of " + "'from cuda.core.experimental import Device'. 
" + "The experimental namespace will be removed in a future release.", + DeprecationWarning, + stacklevel=3, + ) + + +# Import from new locations and re-export +_warn_deprecated() + +from cuda.core import utils # noqa: E402 + +# Make utils accessible as a submodule for backward compatibility +__import__("sys").modules[__spec__.name + ".utils"] = utils +from cuda.core._device import Device # noqa: E402 +from cuda.core._event import Event, EventOptions # noqa: E402 +from cuda.core._graph import ( # noqa: E402 Graph, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, ) -from cuda.core.experimental._launch_config import LaunchConfig # noqa: E402 -from cuda.core.experimental._launcher import launch # noqa: E402 -from cuda.core.experimental._linker import Linker, LinkerOptions # noqa: E402 -from cuda.core.experimental._memory import ( # noqa: E402 +from cuda.core._launch_config import LaunchConfig # noqa: E402 +from cuda.core._launcher import launch # noqa: E402 +from cuda.core._layout import StridedLayout # noqa: E402 +from cuda.core._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core._memory import ( # noqa: E402 Buffer, DeviceMemoryResource, DeviceMemoryResourceOptions, @@ -48,11 +64,47 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core.experimental._module import Kernel, ObjectCode # noqa: E402 -from cuda.core.experimental._program import Program, ProgramOptions # noqa: E402 -from cuda.core.experimental._stream import Stream, StreamOptions # noqa: E402 -from cuda.core.experimental._system import System # noqa: E402 +from cuda.core._module import Kernel, ObjectCode # noqa: E402 +from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._stream import Stream, StreamOptions # noqa: E402 +from cuda.core._system import System # noqa: E402 system = System() __import__("sys").modules[__spec__.name + ".system"] = system del System + + +# Also create forwarding stubs for submodules +# These will be 
imported lazily when accessed +def __getattr__(name): + """Forward attribute access to the new location with deprecation warning.""" + if name in ( + "_context", + "_device", + "_dlpack", + "_event", + "_graph", + "_kernel_arg_handler", + "_launch_config", + "_launcher", + "_layout", + "_linker", + "_memory", + "_memoryview", + "_module", + "_program", + "_stream", + "_system", + "_utils", + ): + _warn_deprecated() + # Import the submodule from the new location + import importlib + + new_name = name.lstrip("_") + try: + return importlib.import_module(f"cuda.core.{new_name}") + except ImportError: + # Fallback to underscore-prefixed name + return importlib.import_module(f"cuda.core.{name}") + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") diff --git a/cuda_core/cuda/core/experimental/include/dlpack.h b/cuda_core/cuda/core/include/dlpack.h similarity index 100% rename from cuda_core/cuda/core/experimental/include/dlpack.h rename to cuda_core/cuda/core/include/dlpack.h diff --git a/cuda_core/cuda/core/experimental/include/layout.hpp b/cuda_core/cuda/core/include/layout.hpp similarity index 100% rename from cuda_core/cuda/core/experimental/include/layout.hpp rename to cuda_core/cuda/core/include/layout.hpp diff --git a/cuda_core/cuda/core/experimental/include/utility.hpp b/cuda_core/cuda/core/include/utility.hpp similarity index 100% rename from cuda_core/cuda/core/experimental/include/utility.hpp rename to cuda_core/cuda/core/include/utility.hpp diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/utils.py similarity index 66% rename from cuda_core/cuda/core/experimental/utils.py rename to cuda_core/cuda/core/utils.py index 3227f1eae1..177adaef2f 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._layout import StridedLayout # noqa: F401 -from cuda.core.experimental._memoryview import ( +from 
cuda.core._layout import StridedLayout # noqa: F401 +from cuda.core._memoryview import ( StridedMemoryView, # noqa: F401 args_viewable_as_strided_memory, # noqa: F401 ) diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index bab2a2b942..47147965ff 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -129,8 +129,8 @@ def skip_member(app, what, name, obj, skip, options): # are assumed to be properties (because cythonized # properties are not recognized as such by autodoc) excluded_dirs = [ - "cuda.core.experimental._layout", - "cuda.core.experimental._memoryview", + "cuda.core._layout", + "cuda.core._memoryview", ] if what == "attribute" and getattr(obj, "__doc__", None) is None: obj_module = getattr(getattr(obj, "__objclass__", None), "__module__", None) diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index 2d2d9833fb..9cc759b500 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -13,7 +13,7 @@ import time import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch def main(): diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index d1553f6b67..b0040708b6 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -25,7 +25,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch # ################################################################################ diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index c4abd06e2c..123b1f6a11 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ 
-16,7 +16,7 @@ import cupy as cp import numpy as np -from cuda.core.experimental import ( +from cuda.core import ( Device, LaunchConfig, LegacyPinnedMemoryResource, diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index ea067302b9..433d63c9eb 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -15,7 +15,7 @@ import sys import torch -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # SAXPY kernel - passing a as a pointer to avoid any type issues code = """ diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index f38caef392..aa0d77eff9 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -14,7 +14,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # compute out = a * x + y code = """ diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py index 8fcecd2d4c..41609de8e5 100644 --- a/cuda_core/examples/show_device_properties.py +++ b/cuda_core/examples/show_device_properties.py @@ -11,7 +11,7 @@ import sys -from cuda.core.experimental import Device, system +from cuda.core import Device, system # Convert boolean to YES or NO string diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index d91ab2c856..c53c1b518a 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -12,7 +12,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch, system +from cuda.core import Device, LaunchConfig, Program, launch, system if system.num_devices < 2: print("this example requires at 
least 2 GPUs", file=sys.stderr) diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index de6007fd26..a20377cc76 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ b/cuda_core/examples/strided_memory_view_cpu.py @@ -26,7 +26,7 @@ print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) FFI = None import numpy as np -from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory +from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index 3e456776a8..e91ddc25cc 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -23,8 +23,8 @@ print("cupy is not installed, the GPU example will be skipped", file=sys.stderr) cp = None import numpy as np -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch -from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index e14158f8bd..f1ea8b8579 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -13,7 +13,7 @@ import sys import numpy as np -from cuda.core.experimental import ( +from cuda.core import ( Device, LaunchConfig, LegacyPinnedMemoryResource, diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 2851303c7e..d31ab77208 100644 --- 
a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -10,7 +10,7 @@ # ################################################################################ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # compute c = a + b code = """ diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index af99ddd361..d25442258d 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -69,7 +69,7 @@ issues = "https://github.com/NVIDIA/cuda-python/issues/" include = ["cuda.core*"] [tool.setuptools.package-data] -"cuda.core.experimental.include" = ["*.h", "*.hpp", "*.cuh"] +"cuda.core.include" = ["*.h", "*.hpp", "*.cuh"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index c0ea03930e..d39cba75ab 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -4,6 +4,7 @@ import multiprocessing import os +import cuda.core import helpers import pytest @@ -12,9 +13,8 @@ except ImportError: from cuda import cuda as driver -import cuda.core.experimental -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, _device -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, _device +from cuda.core._utils.cuda_utils import handle_return @pytest.fixture(scope="session", autouse=True) @@ -124,7 +124,7 @@ def mempool_device(): def _mempool_device_impl(num): - num_devices = len(cuda.core.experimental.system.devices) + num_devices = len(cuda.core.system.devices) if num_devices < num: pytest.skip(f"Test requires at least {num} GPUs") diff --git a/cuda_core/tests/cython/build_tests.sh b/cuda_core/tests/cython/build_tests.sh index eb3303840d..98851edefa 100755 --- 
a/cuda_core/tests/cython/build_tests.sh +++ b/cuda_core/tests/cython/build_tests.sh @@ -6,10 +6,10 @@ UNAME=$(uname) if [ "$UNAME" == "Linux" ] ; then SCRIPTPATH=$(dirname $(realpath "$0")) - export CPLUS_INCLUDE_PATH=${SCRIPTPATH}/../../cuda/core/experimental/include:$CUDA_HOME/include:$CPLUS_INCLUDE_PATH + export CPLUS_INCLUDE_PATH=${SCRIPTPATH}/../../cuda/core/include:$CUDA_HOME/include:$CPLUS_INCLUDE_PATH elif [[ "$UNAME" == CYGWIN* || "$UNAME" == MINGW* || "$UNAME" == MSYS* ]] ; then SCRIPTPATH="$(dirname $(cygpath -w $(realpath "$0")))" - CUDA_CORE_INCLUDE_PATH=$(echo "${SCRIPTPATH}\..\..\cuda\core\experimental\include" | sed 's/\\/\\\\/g') + CUDA_CORE_INCLUDE_PATH=$(echo "${SCRIPTPATH}\..\..\cuda\core\include" | sed 's/\\/\\\\/g') export CL="/I\"${CUDA_CORE_INCLUDE_PATH}\" /I\"${CUDA_HOME}\\include\" ${CL}" else exit 1 diff --git a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx index 0c3921e925..2b105e13ae 100644 --- a/cuda_core/tests/cython/test_get_cuda_native_handle.pyx +++ b/cuda_core/tests/cython/test_get_cuda_native_handle.pyx @@ -13,7 +13,7 @@ from cuda.bindings.nvrtc cimport nvrtcProgram as pynvrtcProgram from cuda.bindings.cydriver cimport CUstream, CUevent from cuda.bindings.cynvrtc cimport nvrtcProgram -from cuda.core.experimental import Device, Program +from cuda.core import Device, Program cdef extern from "utility.hpp": diff --git a/cuda_core/tests/example_tests/test_basic_examples.py b/cuda_core/tests/example_tests/test_basic_examples.py index 450c60bf06..640b53c2fc 100644 --- a/cuda_core/tests/example_tests/test_basic_examples.py +++ b/cuda_core/tests/example_tests/test_basic_examples.py @@ -7,7 +7,7 @@ import os import pytest -from cuda.core.experimental import Device +from cuda.core import Device from .utils import run_example diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index b4d769eab3..3004cd0d00 100644 --- 
a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,8 +3,8 @@ import ctypes -from cuda.core.experimental import Buffer, Device, MemoryResource -from cuda.core.experimental._utils.cuda_utils import driver, handle_return +from cuda.core import Buffer, Device, MemoryResource +from cuda.core._utils.cuda_utils import driver, handle_return from . import libc diff --git a/cuda_core/tests/helpers/latch.py b/cuda_core/tests/helpers/latch.py index 46516c1b06..e35ee3325b 100644 --- a/cuda_core/tests/helpers/latch.py +++ b/cuda_core/tests/helpers/latch.py @@ -4,7 +4,7 @@ import ctypes import pytest -from cuda.core.experimental import ( +from cuda.core import ( LaunchConfig, LegacyPinnedMemoryResource, Program, diff --git a/cuda_core/tests/helpers/nanosleep_kernel.py b/cuda_core/tests/helpers/nanosleep_kernel.py index ea6ae34dcf..99d32c9aa4 100644 --- a/cuda_core/tests/helpers/nanosleep_kernel.py +++ b/cuda_core/tests/helpers/nanosleep_kernel.py @@ -1,7 +1,7 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental import ( +from cuda.core import ( LaunchConfig, Program, ProgramOptions, diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d6280ae0ec..ccb3d3b7cc 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -5,8 +5,8 @@ import pickle import re -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 20 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index ce756cba21..e4b486e6e1 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -4,7 +4,7 @@ import multiprocessing as mp import pytest -from cuda.core.experimental import Device, EventOptions +from cuda.core import Device, EventOptions from helpers.buffers import compare_equal_buffers, make_scratch_buffer from helpers.latch import LatchKernel from helpers.logging import TimestampedLogger diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 54d8056865..d92a28ab5a 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -3,7 +3,7 @@ import multiprocessing as mp -from cuda.core.experimental import Buffer, DeviceMemoryResource +from cuda.core import Buffer, DeviceMemoryResource from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 87dc459ffc..5a06133c9b 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ 
b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -4,8 +4,8 @@ import multiprocessing as mp import pytest -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index 3493828c7e..2df3fe1bbc 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -5,7 +5,7 @@ from itertools import cycle import pytest -from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core import Device, DeviceMemoryResource, DeviceMemoryResourceOptions from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index 7fe65b2b4a..66c5f4da06 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -5,7 +5,7 @@ import multiprocessing.reduction import os -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource +from cuda.core import Buffer, Device, DeviceMemoryResource from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index 3f3f46cd27..b13b9896a1 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -6,7 +6,7 @@ from itertools import cycle import pytest -from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions +from cuda.core import Buffer, Device, DeviceMemoryResource, 
DeviceMemoryResourceOptions from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index c99963cd23..a93e49e4e8 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -8,10 +8,10 @@ across Device, Stream, Event, and Context objects. """ -from cuda.core.experimental import Device, Stream -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import StreamOptions +from cuda.core import Device, Stream +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import StreamOptions # ============================================================================ # Equality Contract Tests diff --git a/cuda_core/tests/test_context.py b/cuda_core/tests/test_context.py index 4fe35dc18d..5183aa1a85 100644 --- a/cuda_core/tests/test_context.py +++ b/cuda_core/tests/test_context.py @@ -1,14 +1,14 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import Device +from cuda.core import Device def test_context_init_disabled(): with pytest.raises(RuntimeError, match=r"^Context objects cannot be instantiated directly\."): - cuda.core.experimental._context.Context() # Ensure back door is locked. + cuda.core._context.Context() # Ensure back door is locked. 
# ============================================================================ diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index b0a0518652..c68f8fb841 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -4,7 +4,7 @@ import pytest from cuda.bindings import driver, runtime -from cuda.core.experimental._utils import cuda_utils +from cuda.core._utils import cuda_utils def test_driver_cu_result_explanations_health(): diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index fa484fa65b..69849b1a2e 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -6,15 +6,15 @@ except ImportError: from cuda import cuda as driver from cuda import cudart as runtime -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import Device -from cuda.core.experimental._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return +from cuda.core import Device +from cuda.core._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return def test_device_init_disabled(): with pytest.raises(RuntimeError, match=r"^DeviceProperties cannot be instantiated directly\."): - cuda.core.experimental._device.DeviceProperties() # Ensure back door is locked. + cuda.core._device.DeviceProperties() # Ensure back door is locked. 
@pytest.fixture(scope="module") @@ -48,7 +48,7 @@ def test_device_alloc(deinit_cuda): def test_device_id(deinit_cuda): - for device in cuda.core.experimental.system.devices: + for device in cuda.core.system.devices: device.set_current() assert device.device_id == handle_return(runtime.cudaGetDevice()) diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index ec35448619..0d8f3a3c2d 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -4,9 +4,9 @@ import math -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import ( +from cuda.core import ( Device, Event, EventOptions, @@ -17,7 +17,7 @@ def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): - cuda.core.experimental._event.Event() # Ensure back door is locked. + cuda.core._event.Event() # Ensure back door is locked. def test_timing_success(init_cuda): diff --git a/cuda_core/tests/test_experimental_backward_compat.py b/cuda_core/tests/test_experimental_backward_compat.py new file mode 100644 index 0000000000..4ebeb8025d --- /dev/null +++ b/cuda_core/tests/test_experimental_backward_compat.py @@ -0,0 +1,164 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +""" +Tests for backward compatibility of cuda.core.experimental namespace. + +These tests verify that the experimental namespace forwarding stubs work +correctly and emit appropriate deprecation warnings. 
+""" + +import pytest + + +# Test that experimental imports still work +def test_experimental_imports_work(): + """Test that imports from experimental namespace still work.""" + # Clear cached module to ensure warning is emitted + import sys + + if "cuda.core.experimental" in sys.modules: + del sys.modules["cuda.core.experimental"] + + # Test main module import - should emit deprecation warning + with pytest.deprecated_call(): + import cuda.core.experimental + + # Test that symbols are accessible + assert hasattr(cuda.core.experimental, "Device") + assert hasattr(cuda.core.experimental, "Stream") + assert hasattr(cuda.core.experimental, "Buffer") + assert hasattr(cuda.core.experimental, "system") + + +def test_experimental_symbols_are_same_objects(): + """Test that experimental namespace symbols are the same objects as core.""" + import cuda.core + import cuda.core.experimental + + # Compare classes/types + assert cuda.core.experimental.Device is cuda.core.Device + assert cuda.core.experimental.Stream is cuda.core.Stream + assert cuda.core.experimental.Buffer is cuda.core.Buffer + assert cuda.core.experimental.MemoryResource is cuda.core.MemoryResource + assert cuda.core.experimental.Program is cuda.core.Program + assert cuda.core.experimental.Kernel is cuda.core.Kernel + assert cuda.core.experimental.ObjectCode is cuda.core.ObjectCode + assert cuda.core.experimental.Graph is cuda.core.Graph + assert cuda.core.experimental.GraphBuilder is cuda.core.GraphBuilder + assert cuda.core.experimental.Event is cuda.core.Event + assert cuda.core.experimental.Linker is cuda.core.Linker + + # Compare singletons + assert cuda.core.experimental.system is cuda.core.system + + +def test_experimental_direct_imports(): + """Test that direct imports from experimental submodules work.""" + # Clear any cached imports to ensure warnings are emitted + import sys + + if "cuda.core.experimental" in sys.modules: + del sys.modules["cuda.core.experimental"] + + # Test various import patterns 
- warning is emitted once at module import time + with pytest.deprecated_call(): + from cuda.core.experimental import ( + Buffer, + Device, + Stream, + ) + + # Verify objects are usable + assert Device is not None + assert Stream is not None + assert Buffer is not None + + +def test_experimental_submodule_access(): + """Test that accessing experimental submodules works.""" + import cuda.core.experimental + + # Test that submodules can be accessed (via __getattr__) + # Note: These may not exist as actual modules, but the forwarding should work + try: + # This should trigger __getattr__ and forward to the new location + _ = cuda.core.experimental._device + _ = cuda.core.experimental._stream + _ = cuda.core.experimental._memory + except AttributeError: + # It's okay if submodules aren't directly accessible + # The important thing is that public symbols work + pass + + +def test_experimental_utils_module(): + """Test that experimental.utils module works. + + Note: The deprecation warning is only emitted once at import time when + cuda.core.experimental is first imported. Accessing utils or importing + from utils does not trigger additional warnings since utils is already + set as an attribute in the module namespace. 
+ """ + import cuda.core.experimental + + # Should be able to access utils (no warning on access, only on initial import) + assert hasattr(cuda.core.experimental, "utils") + assert cuda.core.experimental.utils is not None + + # Should have expected utilities (no warning on import from utils submodule) + from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory + + assert StridedMemoryView is not None + assert args_viewable_as_strided_memory is not None + + +def test_experimental_options_classes(): + """Test that options classes are accessible.""" + import cuda.core.experimental + + assert hasattr(cuda.core.experimental, "EventOptions") + assert hasattr(cuda.core.experimental, "StreamOptions") + assert hasattr(cuda.core.experimental, "LaunchConfig") + assert hasattr(cuda.core.experimental, "ProgramOptions") + assert hasattr(cuda.core.experimental, "LinkerOptions") + assert hasattr(cuda.core.experimental, "GraphCompleteOptions") + assert hasattr(cuda.core.experimental, "GraphDebugPrintOptions") + assert hasattr(cuda.core.experimental, "DeviceMemoryResourceOptions") + assert hasattr(cuda.core.experimental, "VirtualMemoryResourceOptions") + + # Verify they're the same objects + assert cuda.core.experimental.EventOptions is cuda.core.EventOptions + assert cuda.core.experimental.StreamOptions is cuda.core.StreamOptions + assert cuda.core.experimental.LaunchConfig is cuda.core.LaunchConfig + + +def test_experimental_memory_classes(): + """Test that memory-related classes are accessible.""" + import cuda.core.experimental + + assert hasattr(cuda.core.experimental, "MemoryResource") + assert hasattr(cuda.core.experimental, "DeviceMemoryResource") + assert hasattr(cuda.core.experimental, "LegacyPinnedMemoryResource") + assert hasattr(cuda.core.experimental, "VirtualMemoryResource") + assert hasattr(cuda.core.experimental, "GraphMemoryResource") + + # Verify they're the same objects + assert cuda.core.experimental.MemoryResource is 
cuda.core.MemoryResource + assert cuda.core.experimental.DeviceMemoryResource is cuda.core.DeviceMemoryResource + + +@pytest.mark.filterwarnings("ignore::DeprecationWarning") +def test_experimental_instantiations(): + """Test that objects can be instantiated through experimental namespace.""" + from cuda.core.experimental import Device + + # Should be able to create objects + device = Device() + assert device is not None + + # Verify it's the same type + from cuda.core import Device as CoreDevice + + assert isinstance(device, CoreDevice) diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index e988eeebf6..aaad9304f4 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -11,7 +11,7 @@ from cuda.bindings import nvrtc except ImportError: from cuda import nvrtc -from cuda.core.experimental import ( +from cuda.core import ( Device, GraphBuilder, GraphCompleteOptions, @@ -22,7 +22,7 @@ ProgramOptions, launch, ) -from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return +from cuda.core._utils.cuda_utils import NVRTCError, handle_return def _common_kernels(): diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py index 964ce03b93..c68108ba51 100644 --- a/cuda_core/tests/test_graph_mem.py +++ b/cuda_core/tests/test_graph_mem.py @@ -3,7 +3,7 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core.experimental import ( +from cuda.core import ( Device, DeviceMemoryResource, GraphCompleteOptions, diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 4aa801866f..9bc89969a2 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -12,10 +12,10 @@ 5. 
Hash/equality contract compliance (if a == b, then hash(a) must equal hash(b)) """ -from cuda.core.experimental import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import Stream, StreamOptions +from cuda.core import Device +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import Stream, StreamOptions # ============================================================================ # Integration Tests diff --git a/cuda_core/tests/test_helpers.py b/cuda_core/tests/test_helpers.py index 65df23980c..8230f08088 100644 --- a/cuda_core/tests/test_helpers.py +++ b/cuda_core/tests/test_helpers.py @@ -5,7 +5,7 @@ import time import pytest -from cuda.core.experimental import Device +from cuda.core import Device from helpers import IS_WINDOWS, IS_WSL from helpers.buffers import PatternGen, compare_equal_buffers, make_scratch_buffer from helpers.latch import LatchKernel diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index d2e0a89a28..ae3e5531c1 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -12,7 +12,7 @@ cp = None import numpy as np import pytest -from cuda.core.experimental import ( +from cuda.core import ( Device, DeviceMemoryResource, LaunchConfig, @@ -21,8 +21,8 @@ ProgramOptions, launch, ) -from cuda.core.experimental._memory import _SynchronousMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core._memory import _SynchronousMemoryResource +from cuda.core._utils.cuda_utils import CUDAError from conftest import skipif_need_cuda_headers @@ -95,7 +95,7 @@ def test_launch_config_cluster_grid_conversion(init_cuda): def test_launch_config_native_conversion(init_cuda): """Test that _to_native_launch_config correctly converts grid from cluster units to block units.""" - from 
cuda.core.experimental._launch_config import _to_native_launch_config + from cuda.core._launch_config import _to_native_launch_config try: # Test case 1: 1D - Issue #867 example @@ -264,7 +264,7 @@ def test_cooperative_launch(): # # Commented out as this seems to be a sticky error... # config = LaunchConfig(grid=1, block=1) # launch(s, config, ker) - # from cuda.core.experimental._utils.cuda_utils import CUDAError + # from cuda.core._utils.cuda_utils import CUDAError # with pytest.raises(CUDAError) as e: # s.sync() # assert "CUDA_ERROR_LAUNCH_FAILED" in str(e) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index e0c8d37b65..ad68201456 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -3,9 +3,9 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker +from cuda.core._module import ObjectCode +from cuda.core._utils.cuda_utils import CUDAError ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index be46802493..1e198334f8 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -17,7 +17,7 @@ import re import pytest -from cuda.core.experimental import ( +from cuda.core import ( Buffer, Device, DeviceMemoryResource, @@ -27,13 +27,13 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core.experimental import ( +from cuda.core import ( system as ccx_system, ) -from cuda.core.experimental._dlpack import DLDeviceType -from cuda.core.experimental._memory import IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import CUDAError, 
handle_return -from cuda.core.experimental.utils import StridedMemoryView +from cuda.core._dlpack import DLDeviceType +from cuda.core._memory import IPCBufferDescriptor +from cuda.core._utils.cuda_utils import CUDAError, handle_return +from cuda.core.utils import StridedMemoryView from helpers import IS_WINDOWS from helpers.buffers import DummyUnifiedMemoryResource @@ -136,7 +136,7 @@ def test_package_contents(): "VirtualMemoryResource", ] d = {} - exec("from cuda.core.experimental._memory import *", d) # noqa: S102 + exec("from cuda.core._memory import *", d) # noqa: S102 d = {k: v for k, v in d.items() if not k.startswith("__")} assert sorted(expected) == sorted(d.keys()) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index 66c2af23f1..7f48c85089 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -1,139 +1,38 @@ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import DeviceMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError -from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer +from cuda.core import Device NBYTES = 1024 -def test_peer_access_basic(mempool_device_x2): - """Basic tests for dmr.peer_accessible_by.""" - dev0, dev1 = mempool_device_x2 - zero_on_dev0 = make_scratch_buffer(dev0, 0, NBYTES) - one_on_dev0 = make_scratch_buffer(dev0, 1, NBYTES) - stream_on_dev0 = dev0.create_stream() - dmr_on_dev1 = DeviceMemoryResource(dev1) - buf_on_dev1 = dmr_on_dev1.allocate(NBYTES) +def _mempool_device_impl(num): + num_devices = len(cuda.core.system.devices) + if num_devices < num: + pytest.skip(f"Test requires at least {num} GPUs") - # No access at first. 
- assert 0 not in dmr_on_dev1.peer_accessible_by - with pytest.raises(CUDAError, match="CUDA_ERROR_INVALID_VALUE"): - one_on_dev0.copy_to(buf_on_dev1, stream=stream_on_dev0) + devs = [Device(i) for i in range(num)] + for i in reversed(range(num)): + devs[i].set_current() - with pytest.raises(CUDAError, match="CUDA_ERROR_INVALID_VALUE"): - zero_on_dev0.copy_from(buf_on_dev1, stream=stream_on_dev0) + if not all(devs[i].can_access_peer(j) for i in range(num) for j in range(num)): + pytest.skip("Test requires GPUs with peer access") - # Allow access to device 1's allocations from device 0. - dmr_on_dev1.peer_accessible_by = [dev0] - assert 0 in dmr_on_dev1.peer_accessible_by - compare_buffer_to_constant(zero_on_dev0, 0) - one_on_dev0.copy_to(buf_on_dev1, stream=stream_on_dev0) - zero_on_dev0.copy_from(buf_on_dev1, stream=stream_on_dev0) - stream_on_dev0.sync() - compare_buffer_to_constant(zero_on_dev0, 1) + if not all(devs[i].properties.memory_pools_supported for i in range(num)): + pytest.skip("Device does not support mempool operations") - # Revoke access - dmr_on_dev1.peer_accessible_by = [] - assert 0 not in dmr_on_dev1.peer_accessible_by - with pytest.raises(CUDAError, match="CUDA_ERROR_INVALID_VALUE"): - one_on_dev0.copy_to(buf_on_dev1, stream=stream_on_dev0) + return devs - with pytest.raises(CUDAError, match="CUDA_ERROR_INVALID_VALUE"): - zero_on_dev0.copy_from(buf_on_dev1, stream=stream_on_dev0) +@pytest.fixture +def mempool_device_x2(): + """Fixture that provides two devices if available, otherwise skips test.""" + return _mempool_device_impl(2) -def test_peer_access_property_x2(mempool_device_x2): - """The the dmr.peer_accessible_by property (but not its functionality).""" - # The peer access list is a sorted tuple and always excludes the self - # device. 
- dev0, dev1 = mempool_device_x2 - dmr = DeviceMemoryResource(dev0) - def check(expected): - assert isinstance(dmr.peer_accessible_by, tuple) - assert dmr.peer_accessible_by == expected - - # No access to begin with. - check(expected=()) - # fmt: off - dmr.peer_accessible_by = (0,) ; check(expected=()) # noqa: E702 - dmr.peer_accessible_by = (1,) ; check(expected=(1,)) # noqa: E702 - dmr.peer_accessible_by = (0, 1) ; check(expected=(1,)) # noqa: E702 - dmr.peer_accessible_by = () ; check(expected=()) # noqa: E702 - dmr.peer_accessible_by = [0, 1] ; check(expected=(1,)) # noqa: E702 - dmr.peer_accessible_by = set() ; check(expected=()) # noqa: E702 - dmr.peer_accessible_by = [1, 1, 1, 1, 1] ; check(expected=(1,)) # noqa: E702 - # fmt: on - - with pytest.raises(ValueError, match=r"device_id must be \>\= 0"): - dmr.peer_accessible_by = [-1] # device ID out of bounds - - num_devices = len(cuda.core.experimental.system.devices) - - with pytest.raises(ValueError, match=r"device_id must be within \[0, \d+\)"): - dmr.peer_accessible_by = [num_devices] # device ID out of bounds - - -def test_peer_access_transitions(mempool_device_x3): - """Advanced tests for dmr.peer_accessible_by.""" - - # Check all transitions between peer access states. The implementation - # performs transactions that add or remove access as needed. This test - # ensures that that is working as expected. - - # Doing everything from the point-of-view of device 0, there are four - # access states: - # - # [(), (1,), (2,), (1, 2)] - # - # and 4^2-4 = 12 non-identity transitions. - - devs = mempool_device_x3 # Three devices - - # Allocate per-device resources. - streams = [dev.create_stream() for dev in devs] - pgens = [PatternGen(devs[i], NBYTES, streams[i]) for i in range(3)] - dmrs = [DeviceMemoryResource(dev) for dev in devs] - bufs = [dmr.allocate(NBYTES) for dmr in dmrs] - - def verify_state(state, pattern_seed): - """ - Verify an access state from the POV of device 0. 
E.g., (1,) means - device 1 has access but device 2 does not. - """ - # Populate device 0's buffer with a new pattern. - devs[0].set_current() - pgens[0].fill_buffer(bufs[0], seed=pattern_seed) - streams[0].sync() - - for peer in [1, 2]: - devs[peer].set_current() - if peer in state: - # Peer device has access to 0's allocation - bufs[peer].copy_from(bufs[0], stream=streams[peer]) - # Check the result on the peer device. - pgens[peer].verify_buffer(bufs[peer], seed=pattern_seed) - else: - # Peer device has no access to 0's allocation - with pytest.raises(CUDAError, match="CUDA_ERROR_INVALID_VALUE"): - bufs[peer].copy_from(bufs[0], stream=streams[peer]) - - # For each transition, set the access state before and after, checking for - # the expected peer access capabilities at each stop. - pattern_seed = 0 - states = [(), (1,), (2,), (1, 2)] - transitions = [(s0, s1) for s0 in states for s1 in states if s0 != s1] - for init_state, final_state in transitions: - dmrs[0].peer_accessible_by = init_state - assert dmrs[0].peer_accessible_by == init_state - verify_state(init_state, pattern_seed) - pattern_seed += 1 - - dmrs[0].peer_accessible_by = final_state - assert dmrs[0].peer_accessible_by == final_state - verify_state(final_state, pattern_seed) - pattern_seed += 1 +@pytest.fixture +def mempool_device_x3(): + """Fixture that provides three devices if available, otherwise skips test.""" + return _mempool_device_impl(3) diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 901a57f7a4..a1934834fc 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -5,10 +5,10 @@ import pickle import warnings -import cuda.core.experimental +import cuda.core import pytest -from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions, system -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return +from cuda.core import Device, ObjectCode, Program, ProgramOptions, 
system +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: import numba @@ -41,17 +41,17 @@ def cuda12_4_prerequisite_check(): def test_kernel_attributes_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelAttributes cannot be instantiated directly\."): - cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked. + cuda.core._module.KernelAttributes() # Ensure back door is locked. def test_kernel_occupancy_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelOccupancy cannot be instantiated directly\."): - cuda.core.experimental._module.KernelOccupancy() # Ensure back door is locked. + cuda.core._module.KernelOccupancy() # Ensure back door is locked. def test_kernel_init_disabled(): with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."): - cuda.core.experimental._module.Kernel() # Ensure back door is locked. + cuda.core._module.Kernel() # Ensure back door is locked. 
def test_object_code_init_disabled(): @@ -387,7 +387,7 @@ def test_occupancy_max_active_clusters(get_saxpy_kernel_cubin, cluster): dev = Device() if dev.compute_capability < (9, 0): pytest.skip("Device with compute capability 90 or higher is required for cluster support") - launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64, cluster=cluster) + launch_config = cuda.core.LaunchConfig(grid=128, block=64, cluster=cluster) query_fn = kernel.occupancy.max_active_clusters max_active_clusters = query_fn(launch_config) assert isinstance(max_active_clusters, int) @@ -402,7 +402,7 @@ def test_occupancy_max_potential_cluster_size(get_saxpy_kernel_cubin): dev = Device() if dev.compute_capability < (9, 0): pytest.skip("Device with compute capability 90 or higher is required for cluster support") - launch_config = cuda.core.experimental.LaunchConfig(grid=128, block=64) + launch_config = cuda.core.LaunchConfig(grid=128, block=64) query_fn = kernel.occupancy.max_potential_cluster_size max_potential_cluster_size = query_fn(launch_config) assert isinstance(max_potential_cluster_size, int) diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 945ea83964..214fa360b2 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -12,13 +12,13 @@ import warnings from unittest.mock import patch -from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions -from cuda.core.experimental._event import _reduce_event -from cuda.core.experimental._memory._ipc import ( +from cuda.core import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions +from cuda.core._event import _reduce_event +from cuda.core._memory._ipc import ( _deep_reduce_device_memory_resource, _reduce_allocation_handle, ) -from cuda.core.experimental._utils.cuda_utils import reset_fork_warning +from cuda.core._utils.cuda_utils import 
reset_fork_warning def test_warn_on_fork_method_device_memory_resource(ipc_device): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index 8a6526fcc2..e85fd690d9 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -6,10 +6,10 @@ import warnings import pytest -from cuda.core.experimental import _linker -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return +from cuda.core import _linker +from cuda.core._module import Kernel, ObjectCode +from cuda.core._program import Program, ProgramOptions +from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -18,7 +18,7 @@ def _is_nvvm_available(): """Check if NVVM is available.""" try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module _get_nvvm_module() return True @@ -31,7 +31,7 @@ def _is_nvvm_available(): ) try: - from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc + from cuda.core._utils.cuda_utils import driver, handle_return, nvrtc _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) except Exception: @@ -91,7 +91,7 @@ def _get_libnvvm_version_for_tests(): _libnvvm_version_attempted = True try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() @@ -139,7 +139,7 @@ def nvvm_ir(): fallback assumes no version metadata will be present in the input nvvm ir """ - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -329,7 +329,7 
@@ def test_program_close(): @nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() assert nvvm is not None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 695a70e931..01b0b861af 100644 --- a/cuda_core/tests/test_stream.py +++ b/cuda_core/tests/test_stream.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._event import Event -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core import Device, Stream, StreamOptions +from cuda.core._event import Event +from cuda.core._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM +from cuda.core._utils.cuda_utils import driver from helpers.misc import StreamWrapper diff --git a/cuda_core/tests/test_strided_layout.py b/cuda_core/tests/test_strided_layout.py index a0f63f7aaa..f2baaa03d3 100644 --- a/cuda_core/tests/test_strided_layout.py +++ b/cuda_core/tests/test_strided_layout.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from cuda.core.experimental._layout import StridedLayout +from cuda.core._layout import StridedLayout from helpers.layout import ( DenseOrder, LayoutSpec, diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index d5195ed872..fb39d018c3 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -7,8 +7,8 @@ from cuda import cuda as driver from cuda import cudart as runtime -from cuda.core.experimental import Device, system -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core import Device, system +from cuda.core._utils.cuda_utils import handle_return def 
test_system_singleton(): diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 927d7bc239..502adacc89 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -12,15 +12,16 @@ from numba import cuda as numba_cuda except ImportError: numba_cuda = None -import cuda.core.experimental +import cuda.core import numpy as np import pytest -from cuda.core.experimental import Device -from cuda.core.experimental.utils import StridedLayout, StridedMemoryView, args_viewable_as_strided_memory +from cuda.core import Device +from cuda.core._layout import StridedLayout +from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory def test_cast_to_3_tuple_success(): - c3t = cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple + c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple assert c3t("", ()) == (1, 1, 1) assert c3t("", 2) == (2, 1, 1) assert c3t("", (2,)) == (2, 1, 1) @@ -44,7 +45,7 @@ def test_cast_to_3_tuple_success(): ) def test_cast_to_3_tuple_value_error(cfg, expected): with pytest.raises(ValueError, match=expected): - cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) + cuda.core._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) def convert_strides_to_counts(strides, itemsize): diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py b/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py index a661b4f1aa..e7829df406 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py @@ -9,7 +9,7 @@ from contextlib import suppress from typing import Union -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core._utils.cuda_utils import handle_return __all__ = [ "IS_WINDOWS",