diff --git a/.github/ISSUE_TEMPLATE/bug_report.yml b/.github/ISSUE_TEMPLATE/bug_report.yml index 4574e04bf3..1e0b4d2c20 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.yml +++ b/.github/ISSUE_TEMPLATE/bug_report.yml @@ -52,7 +52,7 @@ body: attributes: label: Describe the bug description: A clear and concise description of what problem you are running into. - placeholder: "Attempting to compile a program via `cuda.core.experimental.Program.compile` throws a `ValueError`." + placeholder: "Attempting to compile a program via `cuda.core.Program.compile` throws a `ValueError`." validations: required: true @@ -62,7 +62,7 @@ body: attributes: label: How to Reproduce description: Steps used to reproduce the bug. placeholder: | - 0. Construct a `cuda.core.experimental.Program` instance + 0. Construct a `cuda.core.Program` instance 1. Call the `.compile(...)` method of the instance 2. The call throws a `ValueError` with the following: ``` @@ -76,7 +76,7 @@ body: attributes: label: Expected behavior description: A clear and concise description of what you expected to happen. - placeholder: "Using `cuda.core.experimental.Program.compile(...)` should run successfully and not throw a `ValueError`" + placeholder: "Using `cuda.core.Program.compile(...)` should run successfully and not throw a `ValueError`" validations: required: true diff --git a/.github/ISSUE_TEMPLATE/feature_request.yml b/.github/ISSUE_TEMPLATE/feature_request.yml index cbbc03c492..6d1504c4c4 100644 --- a/.github/ISSUE_TEMPLATE/feature_request.yml +++ b/.github/ISSUE_TEMPLATE/feature_request.yml @@ -36,7 +36,7 @@ body: attributes: label: Is your feature request related to a problem? Please describe. description: A clear and concise description of what the problem is, e.g., "I would like to be able to..." - placeholder: I would like to be able to use the equivalent of `cuda.core.experimental.Program.compile(...)` to compile my code to PTX. + placeholder: I would like to be able to use the equivalent of `cuda.core.Program.compile(...)` to compile my code to PTX. validations: required: true @@ -46,7 +46,7 @@ body: attributes: label: Describe the solution you'd like description: A clear and concise description of what you want to happen. placeholder: | - Support a `ptx` target_type in the `cuda.core.experimental.Program.compile(...)` function. + Support a `ptx` target_type in the `cuda.core.Program.compile(...)` function. validations: required: true @@ -57,7 +57,7 @@ body: attributes: label: Describe alternatives you've considered description: If applicable, please add a clear and concise description of any alternative solutions or features you've considered. - placeholder: The alternatives to using `cuda.core.experimental.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable. + placeholder: The alternatives to using `cuda.core.Program.compile(...)` are unappealing. They usually involve using lower level bindings to something like nvRTC or invoking the nvcc executable.
validations: required: false diff --git a/.spdx-ignore b/.spdx-ignore index 7bbb51dcd5..7263b5414f 100644 --- a/.spdx-ignore +++ b/.spdx-ignore @@ -9,6 +9,6 @@ requirements*.txt cuda_bindings/examples/* # Vendored -cuda_core/cuda/core/experimental/include/dlpack.h +cuda_core/cuda/core/_include/dlpack.h qa/ctk-next.drawio.svg diff --git a/ci/tools/merge_cuda_core_wheels.py b/ci/tools/merge_cuda_core_wheels.py index 359b98d6a7..e3f71ea282 100644 --- a/ci/tools/merge_cuda_core_wheels.py +++ b/ci/tools/merge_cuda_core_wheels.py @@ -12,8 +12,8 @@ In particular, each wheel contains a CUDA-specific build of the `cuda.core` library and the associated bindings. This script merges these directories into a single wheel -that supports both CUDA versions, i.e., containing both `cuda/core/experimental/cu12` -and `cuda/core/experimental/cu13`. At runtime, the code in `cuda/core/experimental/__init__.py` +that supports both CUDA versions, i.e., containing both `cuda/core/cu12` +and `cuda/core/cu13`. At runtime, the code in `cuda/core/__init__.py` is used to import the appropriate CUDA-specific bindings. This script is based on the one in NVIDIA/CCCL. @@ -25,6 +25,7 @@ import subprocess import sys import tempfile +import zipfile from pathlib import Path from typing import List @@ -46,7 +47,38 @@ def run_command(cmd: List[str], cwd: Path = None, env: dict = os.environ) -> sub return result -def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: +def print_wheel_directory_structure(wheel_path: Path, filter_prefix: str = "cuda/core/", label: str = None): + """Print the directory structure of a wheel file, similar to unzip -l output. + + Args: + wheel_path: Path to the wheel file to inspect + filter_prefix: Only show files matching this prefix (default: "cuda/core/") + label: Optional label to print before the structure (e.g., "Input wheel 1: name.whl") + """ + if label: + print(f"\n--- {label} ---", file=sys.stderr) + try: + with zipfile.ZipFile(wheel_path, "r") as zf: + print(f"{'Length':>10} {'Date':>12} {'Time':>8} Name", file=sys.stderr) + print("-" * 80, file=sys.stderr) + total_size = 0 + file_count = 0 + for name in sorted(zf.namelist()): + if filter_prefix in name: + info = zf.getinfo(name) + total_size += info.file_size + file_count += 1 + date_time = info.date_time + date_str = f"{date_time[0]:04d}-{date_time[1]:02d}-{date_time[2]:02d}" + time_str = f"{date_time[3]:02d}:{date_time[4]:02d}:{date_time[5]:02d}" + print(f"{info.file_size:10d} {date_str} {time_str} {name}", file=sys.stderr) + print("-" * 80, file=sys.stderr) + print(f"{total_size:10d} {file_count} files", file=sys.stderr) + except Exception as e: + print(f"Warning: Could not list wheel contents: {e}", file=sys.stderr) + + +def merge_wheels(wheels: List[Path], output_dir: Path, show_wheel_contents: bool = True) -> Path: """Merge multiple wheels into a single wheel with version-specific binaries.""" print("\n=== Merging wheels ===", file=sys.stderr) print(f"Input wheels: {[w.name for w in wheels]}", file=sys.stderr) @@ -91,30 +123,51 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: extracted_wheels.append(extract_dir) + if show_wheel_contents: + print("\n=== Input wheel directory structures ===", file=sys.stderr) + for i, wheel in enumerate(wheels): + print_wheel_directory_structure(wheel, label=f"Input wheel {i + 1}: {wheel.name}") + # Use the first wheel as the base and merge binaries from others base_wheel = extracted_wheels[0] - # now copy the version-specific directory from other wheels - # into the 
appropriate place in the base wheel + # Copy version-specific directories from each wheel into versioned subdirectories + base_dir = Path("cuda") / "core" + for i, wheel_dir in enumerate(extracted_wheels): cuda_version = wheels[i].name.split(".cu")[1].split(".")[0] - base_dir = Path("cuda") / "core" / "experimental" - # Copy from other wheels - print(f" Copying {wheel_dir} to {base_wheel}", file=sys.stderr) - shutil.copytree(wheel_dir / base_dir, base_wheel / base_dir / f"cu{cuda_version}") - - # Overwrite the __init__.py in versioned dirs - os.truncate(base_wheel / base_dir / f"cu{cuda_version}" / "__init__.py", 0) - - # The base dir should only contain __init__.py, the include dir, and the versioned dirs - files_to_remove = os.scandir(base_wheel / base_dir) - for f in files_to_remove: + versioned_dir = base_wheel / base_dir / f"cu{cuda_version}" + + # Copy entire directory tree from source wheel to versioned directory + print(f" Copying {wheel_dir / base_dir} to {versioned_dir}", file=sys.stderr) + shutil.copytree(wheel_dir / base_dir, versioned_dir, dirs_exist_ok=True) + + # Overwrite the __init__.py in versioned dirs to be empty + os.truncate(versioned_dir / "__init__.py", 0) + + print("\n=== Removing files from cuda/core/ directory ===", file=sys.stderr) + items_to_keep = ( + "__init__.py", + "__init__.pxd", + "_version.py", + "_include", + "cu12", + "cu13", + ) + all_items = os.scandir(base_wheel / base_dir) + removed_count = 0 + for f in all_items: f_abspath = f.path - if f.name not in ("__init__.py", "cu12", "cu13", "include"): - if f.is_dir(): - shutil.rmtree(f_abspath) - else: - os.remove(f_abspath) + if f.name in items_to_keep: + continue + if f.is_dir(): + print(f" Removing directory: {f.name}", file=sys.stderr) + shutil.rmtree(f_abspath) + else: + print(f" Removing file: {f.name}", file=sys.stderr) + os.remove(f_abspath) + removed_count += 1 + print(f"Removed {removed_count} items from cuda/core/ directory", file=sys.stderr) # Repack the merged wheel output_dir.mkdir(parents=True, exist_ok=True) @@ -142,6 +195,11 @@ def merge_wheels(wheels: List[Path], output_dir: Path) -> Path: merged_wheel = output_wheels[0] print(f"Successfully merged wheel: {merged_wheel}", file=sys.stderr) + + if show_wheel_contents: + print("\n=== Output wheel directory structure ===", file=sys.stderr) + print_wheel_directory_structure(merged_wheel) + return merged_wheel diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py index 76bf76def5..4337783563 100644 --- a/cuda_core/build_hooks.py +++ b/cuda_core/build_hooks.py @@ -68,7 +68,7 @@ def _build_cuda_core(): # It seems setuptools' wildcard support has problems for namespace packages, # so we explicitly spell out all Extension instances. - root_module = "cuda.core.experimental" + root_module = "cuda.core" root_path = f"{os.path.sep}".join(root_module.split(".")) + os.path.sep ext_files = glob.glob(f"{root_path}/**/*.pyx", recursive=True) @@ -86,6 +86,7 @@ def get_cuda_paths(): print("CUDA paths:", CUDA_PATH) return CUDA_PATH + all_include_dirs = list(os.path.join(root, "include") for root in get_cuda_paths()) extra_compile_args = [] if COMPILE_FOR_COVERAGE: # CYTHON_TRACE_NOGIL indicates to trace nogil functions. 
It is not @@ -94,9 +95,9 @@ def get_cuda_paths(): ext_modules = tuple( Extension( - f"cuda.core.experimental.{mod.replace(os.path.sep, '.')}", - sources=[f"cuda/core/experimental/{mod}.pyx"], - include_dirs=list(os.path.join(root, "include") for root in get_cuda_paths()), + f"cuda.core.{mod.replace(os.path.sep, '.')}", + sources=[f"cuda/core/{mod}.pyx"], + include_dirs=all_include_dirs, language="c++", extra_compile_args=extra_compile_args, ) diff --git a/cuda_core/cuda/core/__init__.py b/cuda_core/cuda/core/__init__.py index 96a80d1f3e..ae7c93a041 100644 --- a/cuda_core/cuda/core/__init__.py +++ b/cuda_core/cuda/core/__init__.py @@ -3,3 +3,66 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.core._version import __version__ + +try: + from cuda import bindings +except ImportError: + raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None +else: + cuda_major, cuda_minor = bindings.__version__.split(".")[:2] + if cuda_major not in ("12", "13"): + raise ImportError("cuda.bindings 12.x or 13.x must be installed") + +import importlib + +subdir = f"cu{cuda_major}" +try: + versioned_mod = importlib.import_module(f".{subdir}", __package__) + # Import all symbols from the module + globals().update(versioned_mod.__dict__) +except ImportError: + # This is not a wheel build, but a conda or local build, do nothing + pass +else: + del versioned_mod +finally: + del bindings, importlib, subdir, cuda_major, cuda_minor + +from cuda.core._device import Device # noqa: E402 +from cuda.core._event import Event, EventOptions # noqa: E402 +from cuda.core._graph import ( # noqa: E402 + Graph, + GraphBuilder, + GraphCompleteOptions, + GraphDebugPrintOptions, +) +from cuda.core._launch_config import LaunchConfig # noqa: E402 +from cuda.core._launcher import launch # noqa: E402 +from cuda.core._layout import _StridedLayout # noqa: E402 +from cuda.core._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core._memory import ( # noqa: E402 + Buffer, + DeviceMemoryResource, + DeviceMemoryResourceOptions, + GraphMemoryResource, + LegacyPinnedMemoryResource, + ManagedMemoryResource, + ManagedMemoryResourceOptions, + MemoryResource, + PinnedMemoryResource, + PinnedMemoryResourceOptions, + VirtualMemoryResource, + VirtualMemoryResourceOptions, +) +from cuda.core._memoryview import ( # noqa: E402 + StridedMemoryView, # noqa: E402 + args_viewable_as_strided_memory, # noqa: E402 +) +from cuda.core._module import Kernel, ObjectCode # noqa: E402 +from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._stream import Stream, StreamOptions # noqa: E402 +from cuda.core._system import System # noqa: E402 + +system = System() +__import__("sys").modules[__spec__.name + ".system"] = system +del System diff --git a/cuda_core/cuda/core/experimental/_context.pyx b/cuda_core/cuda/core/_context.pyx similarity index 94% rename from cuda_core/cuda/core/experimental/_context.pyx rename to cuda_core/cuda/core/_context.pyx index f9858c1710..c1c28b3389 100644 --- a/cuda_core/cuda/core/experimental/_context.pyx +++ b/cuda_core/cuda/core/_context.pyx @@ -4,7 +4,7 @@ from dataclasses import dataclass -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver @dataclass diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/_device.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_device.pyx rename to cuda_core/cuda/core/_device.pyx index b510320f2e..2d775b6580 100644 --- 
a/cuda_core/cuda/core/experimental/_device.pyx +++ b/cuda_core/cuda/core/_device.pyx @@ -6,27 +6,27 @@ cimport cpython from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import threading from typing import Optional, TYPE_CHECKING, Union -from cuda.core.experimental._context import Context, ContextOptions -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._stream import IsStreamT, Stream, StreamOptions -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._context import Context, ContextOptions +from cuda.core._event import Event, EventOptions +from cuda.core._graph import GraphBuilder +from cuda.core._stream import IsStreamT, Stream, StreamOptions +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( ComputeCapability, CUDAError, driver, handle_return, runtime, ) -from cuda.core.experimental._stream cimport default_stream +from cuda.core._stream cimport default_stream if TYPE_CHECKING: - from cuda.core.experimental._memory import Buffer, MemoryResource + from cuda.core._memory import Buffer, MemoryResource # TODO: I prefer to type these as "cdef object" and avoid accessing them from within Python, # but it seems it is very convenient to expose them for testing purposes... @@ -1034,7 +1034,7 @@ class Device: tuple of Device A tuple containing instances of available devices. """ - from cuda.core.experimental import system + from cuda.core import system total = system.get_num_devices() return tuple(cls(device_id) for device_id in range(total)) @@ -1168,17 +1168,17 @@ class Device: ) ) if attr == 1: - from cuda.core.experimental._memory import DeviceMemoryResource + from cuda.core._memory import DeviceMemoryResource self._memory_resource = DeviceMemoryResource(self._id) else: - from cuda.core.experimental._memory import _SynchronousMemoryResource + from cuda.core._memory import _SynchronousMemoryResource self._memory_resource = _SynchronousMemoryResource(self._id) return self._memory_resource @memory_resource.setter def memory_resource(self, mr): - from cuda.core.experimental._memory import MemoryResource + from cuda.core._memory import MemoryResource assert_type(mr, MemoryResource) self._memory_resource = mr @@ -1237,7 +1237,7 @@ class Device: Acts as an entry point of this object. Users always start a code by calling this method, e.g. - >>> from cuda.core.experimental import Device + >>> from cuda.core import Device >>> dev0 = Device(0) >>> dev0.set_current() >>> # ... do work on device 0 ... 
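To make the rename above concrete, here is a minimal usage sketch of the new top-level namespace, assembled from the `Device` docstring in `_device.pyx` and the examples touched later in this diff (e.g. `show_device_properties.py`). It is illustrative only, not part of the patch, and assumes a build of `cuda.core` with `cuda.bindings` 12.x or 13.x is installed.

```python
# Illustrative sketch only: exercising the renamed `cuda.core` namespace.
# Mirrors the docstring example in _device.pyx and the examples in this diff.
from cuda.core import Device, system

# `system` is the module-level System instance registered in cuda/core/__init__.py.
print("visible CUDA devices:", system.get_num_devices())

dev0 = Device(0)
dev0.set_current()
# ... do work on device 0 ...
print("compute capability:", dev0.compute_capability)
```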
diff --git a/cuda_core/cuda/core/experimental/_dlpack.pxd b/cuda_core/cuda/core/_dlpack.pxd similarity index 97% rename from cuda_core/cuda/core/experimental/_dlpack.pxd rename to cuda_core/cuda/core/_dlpack.pxd index d61b6a2bca..7b886cae10 100644 --- a/cuda_core/cuda/core/experimental/_dlpack.pxd +++ b/cuda_core/cuda/core/_dlpack.pxd @@ -14,7 +14,7 @@ from libc.stdint cimport uint64_t from libc.stdint cimport intptr_t -cdef extern from "include/dlpack.h" nogil: +cdef extern from "_include/dlpack.h" nogil: """ #define DLPACK_TENSOR_UNUSED_NAME "dltensor" #define DLPACK_VERSIONED_TENSOR_UNUSED_NAME "dltensor_versioned" diff --git a/cuda_core/cuda/core/experimental/_dlpack.pyx b/cuda_core/cuda/core/_dlpack.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_dlpack.pyx rename to cuda_core/cuda/core/_dlpack.pyx diff --git a/cuda_core/cuda/core/experimental/_event.pxd b/cuda_core/cuda/core/_event.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_event.pxd rename to cuda_core/cuda/core/_event.pxd diff --git a/cuda_core/cuda/core/experimental/_event.pyx b/cuda_core/cuda/core/_event.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_event.pyx rename to cuda_core/cuda/core/_event.pyx index 149c92b8e1..e97fdfbab4 100644 --- a/cuda_core/cuda/core/experimental/_event.pyx +++ b/cuda_core/cuda/core/_event.pyx @@ -8,7 +8,7 @@ cimport cpython from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN ) @@ -18,8 +18,8 @@ from dataclasses import dataclass import multiprocessing from typing import TYPE_CHECKING, Optional -from cuda.core.experimental._context import Context -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._context import Context +from cuda.core._utils.cuda_utils import ( CUDAError, check_multiprocessing_start_method, driver, diff --git a/cuda_core/cuda/core/experimental/_graph.py b/cuda_core/cuda/core/_graph.py similarity index 99% rename from cuda_core/cuda/core/experimental/_graph.py rename to cuda_core/cuda/core/_graph.py index a82bd70f55..df51126bb0 100644 --- a/cuda_core/cuda/core/experimental/_graph.py +++ b/cuda_core/cuda/core/_graph.py @@ -9,8 +9,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import ( + from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import ( driver, get_binding_version, handle_return, diff --git a/cuda_core/cuda/core/experimental/include/dlpack.h b/cuda_core/cuda/core/_include/dlpack.h similarity index 100% rename from cuda_core/cuda/core/experimental/include/dlpack.h rename to cuda_core/cuda/core/_include/dlpack.h diff --git a/cuda_core/cuda/core/experimental/include/layout.hpp b/cuda_core/cuda/core/_include/layout.hpp similarity index 100% rename from cuda_core/cuda/core/experimental/include/layout.hpp rename to cuda_core/cuda/core/_include/layout.hpp diff --git a/cuda_core/cuda/core/experimental/include/utility.hpp b/cuda_core/cuda/core/_include/utility.hpp similarity index 100% rename from cuda_core/cuda/core/experimental/include/utility.hpp rename to cuda_core/cuda/core/_include/utility.hpp diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd b/cuda_core/cuda/core/_kernel_arg_handler.pxd similarity index 100% rename from 
cuda_core/cuda/core/experimental/_kernel_arg_handler.pxd rename to cuda_core/cuda/core/_kernel_arg_handler.pxd diff --git a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx b/cuda_core/cuda/core/_kernel_arg_handler.pyx similarity index 99% rename from cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx rename to cuda_core/cuda/core/_kernel_arg_handler.pyx index e805b8ad66..882ca5eaab 100644 --- a/cuda_core/cuda/core/experimental/_kernel_arg_handler.pyx +++ b/cuda_core/cuda/core/_kernel_arg_handler.pyx @@ -15,8 +15,8 @@ import ctypes import numpy -from cuda.core.experimental._memory import Buffer -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._memory import Buffer +from cuda.core._utils.cuda_utils import driver from cuda.bindings cimport cydriver diff --git a/cuda_core/cuda/core/experimental/_launch_config.pxd b/cuda_core/cuda/core/_launch_config.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_launch_config.pxd rename to cuda_core/cuda/core/_launch_config.pxd diff --git a/cuda_core/cuda/core/experimental/_launch_config.pyx b/cuda_core/cuda/core/_launch_config.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_launch_config.pyx rename to cuda_core/cuda/core/_launch_config.pyx index 1f9de3f999..032c40bd78 100644 --- a/cuda_core/cuda/core/experimental/_launch_config.pyx +++ b/cuda_core/cuda/core/_launch_config.pyx @@ -2,14 +2,14 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) import threading -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import ( CUDAError, cast_to_3_tuple, driver, diff --git a/cuda_core/cuda/core/experimental/_launcher.pyx b/cuda_core/cuda/core/_launcher.pyx similarity index 90% rename from cuda_core/cuda/core/experimental/_launcher.pyx rename to cuda_core/cuda/core/_launcher.pyx index 0e1b9a7d4b..94dc5d02b4 100644 --- a/cuda_core/cuda/core/experimental/_launcher.pyx +++ b/cuda_core/cuda/core/_launcher.pyx @@ -6,19 +6,19 @@ from libc.stdint cimport uintptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._launch_config cimport LaunchConfig -from cuda.core.experimental._kernel_arg_handler cimport ParamHolder -from cuda.core.experimental._stream cimport Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._launch_config cimport LaunchConfig +from cuda.core._kernel_arg_handler cimport ParamHolder +from cuda.core._stream cimport Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) import threading -from cuda.core.experimental._module import Kernel -from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._module import Kernel +from cuda.core._stream import Stream +from cuda.core._utils.cuda_utils import ( _reduce_3_tuple, get_binding_version, ) diff --git a/cuda_core/cuda/core/experimental/_layout.pxd b/cuda_core/cuda/core/_layout.pxd similarity index 99% rename from cuda_core/cuda/core/experimental/_layout.pxd rename to cuda_core/cuda/core/_layout.pxd index ff83449e5d..918a104f2f 100644 --- a/cuda_core/cuda/core/experimental/_layout.pxd +++ b/cuda_core/cuda/core/_layout.pxd @@ -18,7 +18,7 @@ ctypedef uint32_t property_mask_t ctypedef vector.vector[stride_t] extents_strides_t 
ctypedef vector.vector[axis_t] axis_vec_t -from cuda.core.experimental._utils cimport cuda_utils +from cuda.core._utils cimport cuda_utils ctypedef fused integer_t: @@ -26,7 +26,7 @@ ctypedef fused integer_t: int32_t -cdef extern from "include/layout.hpp": +cdef extern from "_include/layout.hpp": cdef int STRIDED_LAYOUT_MAX_NDIM cdef axes_mask_t AXES_MASK_ALL diff --git a/cuda_core/cuda/core/experimental/_layout.pyx b/cuda_core/cuda/core/_layout.pyx similarity index 100% rename from cuda_core/cuda/core/experimental/_layout.pyx rename to cuda_core/cuda/core/_layout.pyx diff --git a/cuda_core/cuda/core/experimental/_linker.py b/cuda_core/cuda/core/_linker.py similarity index 98% rename from cuda_core/cuda/core/experimental/_linker.py rename to cuda_core/cuda/core/_linker.py index 2c94fb9b02..1f6f221a39 100644 --- a/cuda_core/cuda/core/experimental/_linker.py +++ b/cuda_core/cuda/core/_linker.py @@ -15,10 +15,10 @@ if TYPE_CHECKING: import cuda.bindings -from cuda.core.experimental._device import Device -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence +from cuda.core._device import Device +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import check_or_create_options, driver, handle_return, is_sequence # TODO: revisit this treatment for py313t builds _driver = None # populated if nvJitLink cannot be used @@ -388,7 +388,7 @@ def _exception_manager(self): class Linker: """Represent a linking machinery to link one or multiple object codes into - :obj:`~cuda.core.experimental._module.ObjectCode` with the specified options. + :obj:`~cuda.core._module.ObjectCode` with the specified options. This object provides a unified interface to multiple underlying linker libraries (such as nvJitLink or cuLink* from CUDA driver). 
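Since the public entry points now live under `cuda.core` while `cuda.core.experimental` becomes a deprecation shim (see the rewritten `experimental/__init__.py` later in this diff), a rough migration smoke test is sketched below. This is a hypothetical check, not part of the patch; it assumes a fresh interpreter (the shim warns only on its first import) and an installed `cuda.bindings`.

```python
# Hypothetical migration smoke test (not part of this patch).
# Run in a fresh interpreter: the shim emits its warning only on first import.
import warnings

from cuda.core import Program  # new, supported import path

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    # Importing the old namespace still works, but goes through the shim.
    from cuda.core.experimental import Program as LegacyProgram

# Both paths expose the same public class, and the shim raised a DeprecationWarning.
assert LegacyProgram.__name__ == Program.__name__ == "Program"
assert any(issubclass(w.category, DeprecationWarning) for w in caught)
```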
diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.pxd b/cuda_core/cuda/core/_memory/__init__.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/__init__.pxd rename to cuda_core/cuda/core/_memory/__init__.pxd diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/_memory/__init__.py similarity index 100% rename from cuda_core/cuda/core/experimental/_memory/__init__.py rename to cuda_core/cuda/core/_memory/__init__.py diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd b/cuda_core/cuda/core/_memory/_buffer.pxd similarity index 92% rename from cuda_core/cuda/core/experimental/_memory/_buffer.pxd rename to cuda_core/cuda/core/_memory/_buffer.pxd index b581dcd293..730e448f63 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pxd +++ b/cuda_core/cuda/core/_memory/_buffer.pxd @@ -4,7 +4,7 @@ from libc.stdint cimport uintptr_t -from cuda.core.experimental._stream cimport Stream +from cuda.core._stream cimport Stream cdef struct _MemAttrs: diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/_memory/_buffer.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/_buffer.pyx rename to cuda_core/cuda/core/_memory/_buffer.pyx index 1d6f1b3705..b92c9d51ce 100644 --- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx +++ b/cuda_core/cuda/core/_memory/_buffer.pyx @@ -9,12 +9,12 @@ from libc.stdint cimport uint8_t, uint16_t, uint32_t, uintptr_t from cpython.buffer cimport PyObject_GetBuffer, PyBuffer_Release, Py_buffer, PyBUF_SIMPLE from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._device_memory_resource import DeviceMemoryResource -from cuda.core.experimental._memory._pinned_memory_resource import PinnedMemoryResource -from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._memory._device_memory_resource import DeviceMemoryResource +from cuda.core._memory._pinned_memory_resource import PinnedMemoryResource +from cuda.core._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer +from cuda.core._memory cimport _ipc +from cuda.core._stream cimport Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN import sys from typing import TypeVar, Union @@ -24,9 +24,9 @@ if sys.version_info >= (3, 12): else: BufferProtocol = object -from cuda.core.experimental._dlpack import DLDeviceType, make_py_capsule -from cuda.core.experimental._utils.cuda_utils import driver -from cuda.core.experimental._device import Device +from cuda.core._dlpack import DLDeviceType, make_py_capsule +from cuda.core._utils.cuda_utils import driver +from cuda.core._device import Device __all__ = ['Buffer', 'MemoryResource'] diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd similarity index 66% rename from cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_device_memory_resource.pxd index 17ee12e54f..c293d72750 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pxd @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._memory_pool cimport 
_MemPool -from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core._memory._memory_pool cimport _MemPool +from cuda.core._memory._ipc cimport IPCDataForMR cdef class DeviceMemoryResource(_MemPool): diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_device_memory_resource.pyx index dc6150d75a..d0cc82184a 100644 --- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_device_memory_resource.pyx @@ -5,10 +5,10 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._memory cimport _ipc +from cuda.core._memory._ipc cimport IPCAllocationHandle +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) @@ -19,7 +19,7 @@ from typing import TYPE_CHECKING import platform # no-cython-lint import uuid -from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._utils.cuda_utils import check_multiprocessing_start_method if TYPE_CHECKING: from .._device import Device diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd similarity index 77% rename from cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_graph_memory_resource.pxd index f9c7798e76..2f6c35d72e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._buffer cimport MemoryResource +from cuda.core._memory._buffer cimport MemoryResource cdef class cyGraphMemoryResource(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_graph_memory_resource.pyx index c65354b612..bda075c201 100644 --- a/cuda_core/cuda/core/experimental/_memory/_graph_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_graph_memory_resource.pyx @@ -7,15 +7,15 @@ from __future__ import annotations from libc.stdint cimport intptr_t from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN from functools import cache from typing import TYPE_CHECKING if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT __all__ = 
['GraphMemoryResource'] diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/_memory/_ipc.pxd similarity index 92% rename from cuda_core/cuda/core/experimental/_memory/_ipc.pxd rename to cuda_core/cuda/core/_memory/_ipc.pxd index 3fed2b7188..0c7375efdb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd +++ b/cuda_core/cuda/core/_memory/_ipc.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._memory._memory_pool cimport _MemPool +from cuda.core._memory._buffer cimport Buffer +from cuda.core._memory._memory_pool cimport _MemPool # Holds _MemPool objects imported by this process. This enables diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/_memory/_ipc.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_ipc.pyx rename to cuda_core/cuda/core/_memory/_ipc.pyx index 980e814e11..793e4168d7 100644 --- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx +++ b/cuda_core/cuda/core/_memory/_ipc.pyx @@ -7,10 +7,10 @@ from libc.stdint cimport uintptr_t from libc.string cimport memcpy from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer -from cuda.core.experimental._stream cimport default_stream -from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN -from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._memory._buffer cimport Buffer +from cuda.core._stream cimport default_stream +from cuda.core._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core._utils.cuda_utils import check_multiprocessing_start_method import multiprocessing import os diff --git a/cuda_core/cuda/core/experimental/_memory/_legacy.py b/cuda_core/cuda/core/_memory/_legacy.py similarity index 89% rename from cuda_core/cuda/core/experimental/_memory/_legacy.py rename to cuda_core/cuda/core/_memory/_legacy.py index 09ea0e15d2..317494ea9e 100644 --- a/cuda_core/cuda/core/experimental/_memory/_legacy.py +++ b/cuda_core/cuda/core/_memory/_legacy.py @@ -6,16 +6,16 @@ from typing import TYPE_CHECKING -from cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, ) -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._utils.cuda_utils import ( driver, ) if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT __all__ = ["LegacyPinnedMemoryResource", "_SynchronousMemoryResource"] @@ -43,7 +43,7 @@ def allocate(self, size, stream=None) -> Buffer: The allocated buffer object, which is accessible on both host and device. 
""" if stream is None: - from cuda.core.experimental._stream import default_stream + from cuda.core._stream import default_stream stream = default_stream() err, ptr = driver.cuMemAllocHost(size) @@ -93,7 +93,7 @@ def __init__(self, device_id): def allocate(self, size, stream=None) -> Buffer: if stream is None: - from cuda.core.experimental._stream import default_stream + from cuda.core._stream import default_stream stream = default_stream() err, ptr = driver.cuMemAlloc(size) diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd similarity index 75% rename from cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_managed_memory_resource.pxd index 3e9aed7bee..46e00cd4cb 100644 --- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pxd @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._memory_pool cimport _MemPool +from cuda.core._memory._memory_pool cimport _MemPool cdef class ManagedMemoryResource(_MemPool): diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_managed_memory_resource.pyx index a16a890332..1b8b03f8f2 100644 --- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_managed_memory_resource.pyx @@ -5,8 +5,8 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, ) diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd b/cuda_core/cuda/core/_memory/_memory_pool.pxd similarity index 85% rename from cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd rename to cuda_core/cuda/core/_memory/_memory_pool.pxd index 68b2e6438f..8d9961b68b 100644 --- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd +++ b/cuda_core/cuda/core/_memory/_memory_pool.pxd @@ -3,8 +3,8 @@ # SPDX-License-Identifier: Apache-2.0 from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport MemoryResource -from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core._memory._buffer cimport MemoryResource +from cuda.core._memory._ipc cimport IPCDataForMR cdef class _MemPool(MemoryResource): diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx b/cuda_core/cuda/core/_memory/_memory_pool.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx rename to cuda_core/cuda/core/_memory/_memory_pool.pyx index ef7b692de4..f1b72d47b5 100644 --- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx +++ b/cuda_core/cuda/core/_memory/_memory_pool.pyx @@ -10,10 +10,10 @@ from libc.string cimport memset from cpython.mem cimport PyMem_Malloc, PyMem_Free from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._stream cimport 
default_stream, Stream_accept, Stream -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._buffer cimport Buffer, MemoryResource +from cuda.core._memory cimport _ipc +from cuda.core._stream cimport default_stream, Stream_accept, Stream +from cuda.core._utils.cuda_utils cimport ( HANDLE_RETURN, ) @@ -21,10 +21,10 @@ from typing import TYPE_CHECKING import platform # no-cython-lint import weakref -from cuda.core.experimental._utils.cuda_utils import driver +from cuda.core._utils.cuda_utils import driver if TYPE_CHECKING: - from cuda.core.experimental._memory.buffer import DevicePointerT + from cuda.core._memory.buffer import DevicePointerT from .._device import Device diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd similarity index 60% rename from cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd rename to cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd index df225c1860..a8262d9bd8 100644 --- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pxd @@ -2,8 +2,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memory._memory_pool cimport _MemPool -from cuda.core.experimental._memory._ipc cimport IPCDataForMR +from cuda.core._memory._memory_pool cimport _MemPool +from cuda.core._memory._ipc cimport IPCDataForMR cdef class PinnedMemoryResource(_MemPool): diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx similarity index 96% rename from cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx rename to cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx index 1dba1f2423..b2a9db4594 100644 --- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx +++ b/cuda_core/cuda/core/_memory/_pinned_memory_resource.pyx @@ -5,10 +5,10 @@ from __future__ import annotations from cuda.bindings cimport cydriver -from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions -from cuda.core.experimental._memory cimport _ipc -from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._memory._memory_pool cimport _MemPool, _MemPoolOptions +from cuda.core._memory cimport _ipc +from cuda.core._memory._ipc cimport IPCAllocationHandle +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, HANDLE_RETURN, ) @@ -22,7 +22,7 @@ import threading import uuid import warnings -from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method +from cuda.core._utils.cuda_utils import check_multiprocessing_start_method # Cache to ensure NUMA warning is only raised once per process diff --git a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py similarity index 98% rename from cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py rename to cuda_core/cuda/core/_memory/_virtual_memory_resource.py index 2806e2d0d5..43da00744a 100644 --- a/cuda_core/cuda/core/experimental/_memory/_virtual_memory_resource.py +++ b/cuda_core/cuda/core/_memory/_virtual_memory_resource.py @@ -7,20 +7,20 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Iterable, Literal, Union -from cuda.core.experimental._device import Device -from 
cuda.core.experimental._memory._buffer import Buffer, MemoryResource -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._memory._buffer import Buffer, MemoryResource +from cuda.core._utils.cuda_utils import ( Transaction, check_or_create_options, driver, get_binding_version, ) -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._utils.cuda_utils import ( _check_driver_error as raise_if_driver_error, ) if TYPE_CHECKING: - from cuda.core.experimental._stream import Stream + from cuda.core._stream import Stream __all__ = ["VirtualMemoryResourceOptions", "VirtualMemoryResource"] diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/_memoryview.pyx similarity index 99% rename from cuda_core/cuda/core/experimental/_memoryview.pyx rename to cuda_core/cuda/core/_memoryview.pyx index 6c995b7ef0..24cf3ad99f 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/_memoryview.pyx @@ -4,8 +4,8 @@ from ._dlpack cimport * from libc.stdint cimport intptr_t -from cuda.core.experimental._layout cimport _StridedLayout -from cuda.core.experimental._stream import Stream +from cuda.core._layout cimport _StridedLayout +from cuda.core._stream import Stream import functools import warnings @@ -13,10 +13,10 @@ from typing import Optional import numpy -from cuda.core.experimental._utils.cuda_utils import handle_return, driver +from cuda.core._utils.cuda_utils import handle_return, driver -from cuda.core.experimental._memory import Buffer +from cuda.core._memory import Buffer # TODO(leofang): support NumPy structured dtypes diff --git a/cuda_core/cuda/core/experimental/_module.py b/cuda_core/cuda/core/_module.py similarity index 98% rename from cuda_core/cuda/core/experimental/_module.py rename to cuda_core/cuda/core/_module.py index 9af722465b..fbea314406 100644 --- a/cuda_core/cuda/core/experimental/_module.py +++ b/cuda_core/cuda/core/_module.py @@ -7,15 +7,15 @@ from typing import Union from warnings import warn -from cuda.core.experimental._device import Device -from cuda.core.experimental._launch_config import LaunchConfig, _to_native_launch_config -from cuda.core.experimental._stream import Stream -from cuda.core.experimental._utils.clear_error_support import ( +from cuda.core._device import Device +from cuda.core._launch_config import LaunchConfig, _to_native_launch_config +from cuda.core._stream import Stream +from cuda.core._utils.clear_error_support import ( assert_type, assert_type_str_or_bytes_like, raise_code_path_meant_to_be_unreachable, ) -from cuda.core.experimental._utils.cuda_utils import driver, get_binding_version, handle_return, precondition +from cuda.core._utils.cuda_utils import driver, get_binding_version, handle_return, precondition _backend = { "old": { @@ -453,7 +453,7 @@ class ObjectCode: This class has no default constructor. If you already have a cubin that you would like to load, use the :meth:`from_cubin` alternative constructor. 
Constructing directly from all other possible code types should be avoided in favor of compilation through - :class:`~cuda.core.experimental.Program` + :class:`~cuda.core.Program` Note ---- diff --git a/cuda_core/cuda/core/experimental/_program.py b/cuda_core/cuda/core/_program.py similarity index 99% rename from cuda_core/cuda/core/experimental/_program.py rename to cuda_core/cuda/core/_program.py index f3ad9af644..121dd13963 100644 --- a/cuda_core/cuda/core/experimental/_program.py +++ b/cuda_core/cuda/core/_program.py @@ -13,11 +13,11 @@ if TYPE_CHECKING: import cuda.bindings -from cuda.core.experimental._device import Device -from cuda.core.experimental._linker import Linker, LinkerHandleT, LinkerOptions -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.clear_error_support import assert_type -from cuda.core.experimental._utils.cuda_utils import ( +from cuda.core._device import Device +from cuda.core._linker import Linker, LinkerHandleT, LinkerOptions +from cuda.core._module import ObjectCode +from cuda.core._utils.clear_error_support import assert_type +from cuda.core._utils.cuda_utils import ( CUDAError, _handle_boolean_option, check_or_create_options, diff --git a/cuda_core/cuda/core/experimental/_stream.pxd b/cuda_core/cuda/core/_stream.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_stream.pxd rename to cuda_core/cuda/core/_stream.pxd diff --git a/cuda_core/cuda/core/experimental/_stream.pyx b/cuda_core/cuda/core/_stream.pyx similarity index 97% rename from cuda_core/cuda/core/experimental/_stream.pyx rename to cuda_core/cuda/core/_stream.pyx index 87ec4a691a..b724f9aee3 100644 --- a/cuda_core/cuda/core/experimental/_stream.pyx +++ b/cuda_core/cuda/core/_stream.pyx @@ -9,8 +9,8 @@ from libc.stdlib cimport strtol, getenv from cuda.bindings cimport cydriver -from cuda.core.experimental._event cimport Event as cyEvent -from cuda.core.experimental._utils.cuda_utils cimport ( +from cuda.core._event cimport Event as cyEvent +from cuda.core._utils.cuda_utils cimport ( check_or_create_options, CU_CONTEXT_INVALID, get_device_from_ctx, @@ -24,11 +24,11 @@ from typing import TYPE_CHECKING, Optional, Protocol, Union if TYPE_CHECKING: import cuda.bindings - from cuda.core.experimental._device import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._graph import GraphBuilder -from cuda.core.experimental._utils.cuda_utils import ( + from cuda.core._device import Device +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._graph import GraphBuilder +from cuda.core._utils.cuda_utils import ( driver, ) @@ -311,7 +311,7 @@ cdef class Stream: context is set current after a stream is created. 
""" - from cuda.core.experimental._device import Device # avoid circular import + from cuda.core._device import Device # avoid circular import self._get_device_and_context() return Device((self._device_id)) diff --git a/cuda_core/cuda/core/experimental/_system.py b/cuda_core/cuda/core/_system.py similarity index 95% rename from cuda_core/cuda/core/experimental/_system.py rename to cuda_core/cuda/core/_system.py index ac157f5760..6f06587b46 100644 --- a/cuda_core/cuda/core/experimental/_system.py +++ b/cuda_core/cuda/core/_system.py @@ -4,8 +4,8 @@ import warnings -from cuda.core.experimental._device import Device -from cuda.core.experimental._utils.cuda_utils import driver, handle_return, runtime +from cuda.core._device import Device +from cuda.core._utils.cuda_utils import driver, handle_return, runtime class System: diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.pxd b/cuda_core/cuda/core/_utils/__init__.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/__init__.pxd rename to cuda_core/cuda/core/_utils/__init__.pxd diff --git a/cuda_core/cuda/core/experimental/_utils/__init__.py b/cuda_core/cuda/core/_utils/__init__.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/__init__.py rename to cuda_core/cuda/core/_utils/__init__.py diff --git a/cuda_core/cuda/core/experimental/_utils/clear_error_support.py b/cuda_core/cuda/core/_utils/clear_error_support.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/clear_error_support.py rename to cuda_core/cuda/core/_utils/clear_error_support.py diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd b/cuda_core/cuda/core/_utils/cuda_utils.pxd similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.pxd rename to cuda_core/cuda/core/_utils/cuda_utils.pxd diff --git a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx b/cuda_core/cuda/core/_utils/cuda_utils.pyx similarity index 98% rename from cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx rename to cuda_core/cuda/core/_utils/cuda_utils.pyx index 4489871747..0c3f6521a4 100644 --- a/cuda_core/cuda/core/experimental/_utils/cuda_utils.pyx +++ b/cuda_core/cuda/core/_utils/cuda_utils.pyx @@ -20,8 +20,8 @@ except ImportError: from cuda import cudart as runtime from cuda import nvrtc -from cuda.core.experimental._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS -from cuda.core.experimental._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS +from cuda.core._utils.driver_cu_result_explanations import DRIVER_CU_RESULT_EXPLANATIONS +from cuda.core._utils.runtime_cuda_error_explanations import RUNTIME_CUDA_ERROR_EXPLANATIONS class CUDAError(Exception): diff --git a/cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py b/cuda_core/cuda/core/_utils/driver_cu_result_explanations.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/driver_cu_result_explanations.py rename to cuda_core/cuda/core/_utils/driver_cu_result_explanations.py diff --git a/cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py b/cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py similarity index 100% rename from cuda_core/cuda/core/experimental/_utils/runtime_cuda_error_explanations.py rename to cuda_core/cuda/core/_utils/runtime_cuda_error_explanations.py diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py index 
92174468d1..95d548985f 100644 --- a/cuda_core/cuda/core/experimental/__init__.py +++ b/cuda_core/cuda/core/experimental/__init__.py @@ -2,43 +2,60 @@ # # SPDX-License-Identifier: Apache-2.0 -try: - from cuda import bindings -except ImportError: - raise ImportError("cuda.bindings 12.x or 13.x must be installed") from None -else: - cuda_major, cuda_minor = bindings.__version__.split(".")[:2] - if cuda_major not in ("12", "13"): - raise ImportError("cuda.bindings 12.x or 13.x must be installed") - -import importlib - -subdir = f"cu{cuda_major}" -try: - versioned_mod = importlib.import_module(f".{subdir}", __package__) - # Import all symbols from the module - globals().update(versioned_mod.__dict__) -except ImportError: - # This is not a wheel build, but a conda or local build, do nothing - pass -else: - del versioned_mod -finally: - del bindings, importlib, subdir, cuda_major, cuda_minor - -from cuda.core.experimental import utils # noqa: E402 -from cuda.core.experimental._device import Device # noqa: E402 -from cuda.core.experimental._event import Event, EventOptions # noqa: E402 -from cuda.core.experimental._graph import ( # noqa: E402 +""" +Backward compatibility stubs for the cuda.core.experimental namespace. + +This module provides forwarding stubs that import from the new cuda.core.* +locations and emit deprecation warnings. Users should migrate to importing +directly from cuda.core instead of cuda.core.experimental. + +The experimental namespace will be removed in a future release. + +Note: Underscored modules (e.g., _device, _memory) are not public APIs +and are intentionally not made accessible here. +""" + +import warnings + + +def _warn_deprecated(): + """Emit a deprecation warning for using the experimental namespace. + + Note: This warning is only emitted when the experimental module is first imported. + Subsequent accesses to attributes (like utils, Device, etc.) do not trigger + additional warnings since they are already set in the module namespace. + """ + warnings.warn( + "The cuda.core.experimental namespace is deprecated. " + "Please import directly from cuda.core instead. " + "For example, use 'from cuda.core import Device' instead of " + "'from cuda.core.experimental import Device'. 
" + "The experimental namespace will be removed in a future release.", + DeprecationWarning, + stacklevel=3, + ) + + +# Import from new locations and re-export +_warn_deprecated() + +from cuda.core import utils # noqa: E402 + +# Make utils accessible as a submodule for backward compatibility +__import__("sys").modules[__spec__.name + ".utils"] = utils +from cuda.core._device import Device # noqa: E402 +from cuda.core._event import Event, EventOptions # noqa: E402 +from cuda.core._graph import ( # noqa: E402 Graph, GraphBuilder, GraphCompleteOptions, GraphDebugPrintOptions, ) -from cuda.core.experimental._launch_config import LaunchConfig # noqa: E402 -from cuda.core.experimental._launcher import launch # noqa: E402 -from cuda.core.experimental._linker import Linker, LinkerOptions # noqa: E402 -from cuda.core.experimental._memory import ( # noqa: E402 +from cuda.core._launch_config import LaunchConfig # noqa: E402 +from cuda.core._launcher import launch # noqa: E402 +from cuda.core._layout import _StridedLayout # noqa: E402 +from cuda.core._linker import Linker, LinkerOptions # noqa: E402 +from cuda.core._memory import ( # noqa: E402 Buffer, DeviceMemoryResource, DeviceMemoryResourceOptions, @@ -52,10 +69,14 @@ VirtualMemoryResource, VirtualMemoryResourceOptions, ) -from cuda.core.experimental._module import Kernel, ObjectCode # noqa: E402 -from cuda.core.experimental._program import Program, ProgramOptions # noqa: E402 -from cuda.core.experimental._stream import Stream, StreamOptions # noqa: E402 -from cuda.core.experimental._system import System # noqa: E402 +from cuda.core._memoryview import ( # noqa: E402 + StridedMemoryView, # noqa: E402 + args_viewable_as_strided_memory, # noqa: E402 +) +from cuda.core._module import Kernel, ObjectCode # noqa: E402 +from cuda.core._program import Program, ProgramOptions # noqa: E402 +from cuda.core._stream import Stream, StreamOptions # noqa: E402 +from cuda.core._system import System # noqa: E402 system = System() __import__("sys").modules[__spec__.name + ".system"] = system diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/utils.py similarity index 82% rename from cuda_core/cuda/core/experimental/utils.py rename to cuda_core/cuda/core/utils.py index 32f62918f6..f15d924277 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/utils.py @@ -2,7 +2,7 @@ # # SPDX-License-Identifier: Apache-2.0 -from cuda.core.experimental._memoryview import ( +from cuda.core._memoryview import ( StridedMemoryView, # noqa: F401 args_viewable_as_strided_memory, # noqa: F401 ) diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst index 51e505b59d..e24334e476 100644 --- a/cuda_core/docs/source/api.rst +++ b/cuda_core/docs/source/api.rst @@ -1,14 +1,16 @@ .. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 -.. module:: cuda.core.experimental +.. module:: cuda.core -``cuda.core.experimental`` API Reference -======================================== +``cuda.core`` API Reference +=========================== -All of the APIs listed (or cross-referenced from) below are considered *experimental* -and subject to future changes without deprecation notice. Once stabilized they will be -moved out of the ``experimental`` namespace. +This is the main API reference for ``cuda.core``. The package has not yet +reached version 1.0.0, and APIs may change between minor versions, possibly +without deprecation warnings. 
Once version 1.0.0 is released, APIs will +be considered stable and will follow semantic versioning with appropriate +deprecation periods for breaking changes. CUDA runtime @@ -64,11 +66,11 @@ CUDA compilation toolchain CUDA system information ----------------------- -.. automethod:: cuda.core.experimental._system.System.get_driver_version -.. automethod:: cuda.core.experimental._system.System.get_num_devices +.. automethod:: cuda.core._system.System.get_driver_version +.. automethod:: cuda.core._system.System.get_num_devices -.. module:: cuda.core.experimental.utils +.. module:: cuda.core.utils Utility functions ----------------- diff --git a/cuda_core/docs/source/api_private.rst b/cuda_core/docs/source/api_private.rst index b832cfdbce..b0fcf61291 100644 --- a/cuda_core/docs/source/api_private.rst +++ b/cuda_core/docs/source/api_private.rst @@ -8,7 +8,7 @@ via returned values from public APIs. These classes must be referred in public APIs returning their instances. -.. currentmodule:: cuda.core.experimental +.. currentmodule:: cuda.core CUDA runtime ------------ diff --git a/cuda_core/docs/source/conf.py b/cuda_core/docs/source/conf.py index bab2a2b942..e5136e040a 100644 --- a/cuda_core/docs/source/conf.py +++ b/cuda_core/docs/source/conf.py @@ -99,14 +99,14 @@ def autodoc_process_docstring(app, what, name, obj, options, lines): - if name.startswith("cuda.core.experimental._system.System"): + if name.startswith("cuda.core._system.System"): name = name.replace("._system.System", ".system") # patch the docstring (in lines) *in-place*. Should docstrings include section titles other than "Returns", # this will need to be modified to handle them. while lines: lines.pop() attr = name.split(".")[-1] - from cuda.core.experimental._system import System + from cuda.core._system import System original_lines = getattr(System, attr).__doc__.split("\n") new_lines = [] @@ -129,8 +129,8 @@ def skip_member(app, what, name, obj, skip, options): # are assumed to be properties (because cythonized # properties are not recognized as such by autodoc) excluded_dirs = [ - "cuda.core.experimental._layout", - "cuda.core.experimental._memoryview", + "cuda.core._layout", + "cuda.core._memoryview", ] if what == "attribute" and getattr(obj, "__doc__", None) is None: obj_module = getattr(getattr(obj, "__objclass__", None), "__module__", None) diff --git a/cuda_core/docs/source/getting-started.rst b/cuda_core/docs/source/getting-started.rst index 2bc7c6156e..2ac779dc2b 100644 --- a/cuda_core/docs/source/getting-started.rst +++ b/cuda_core/docs/source/getting-started.rst @@ -1,7 +1,7 @@ .. SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 -.. currentmodule:: cuda.core.experimental +.. currentmodule:: cuda.core Overview ======== @@ -59,7 +59,7 @@ Don't forget to use :meth:`Device.set_current`! .. code-block:: python import cupy as cp - from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch + from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch dev = Device() dev.set_current() diff --git a/cuda_core/docs/source/interoperability.rst b/cuda_core/docs/source/interoperability.rst index 2d3657abed..9871ebb5bf 100644 --- a/cuda_core/docs/source/interoperability.rst +++ b/cuda_core/docs/source/interoperability.rst @@ -1,7 +1,7 @@ .. SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. .. SPDX-License-Identifier: Apache-2.0 -.. 
currentmodule:: cuda.core.experimental +.. currentmodule:: cuda.core Interoperability ================ diff --git a/cuda_core/examples/cuda_graphs.py b/cuda_core/examples/cuda_graphs.py index 2d2d9833fb..9cc759b500 100644 --- a/cuda_core/examples/cuda_graphs.py +++ b/cuda_core/examples/cuda_graphs.py @@ -13,7 +13,7 @@ import time import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch def main(): diff --git a/cuda_core/examples/jit_lto_fractal.py b/cuda_core/examples/jit_lto_fractal.py index d1553f6b67..b0040708b6 100644 --- a/cuda_core/examples/jit_lto_fractal.py +++ b/cuda_core/examples/jit_lto_fractal.py @@ -25,7 +25,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Linker, LinkerOptions, Program, ProgramOptions, launch # ################################################################################ diff --git a/cuda_core/examples/memory_ops.py b/cuda_core/examples/memory_ops.py index c4abd06e2c..123b1f6a11 100644 --- a/cuda_core/examples/memory_ops.py +++ b/cuda_core/examples/memory_ops.py @@ -16,7 +16,7 @@ import cupy as cp import numpy as np -from cuda.core.experimental import ( +from cuda.core import ( Device, LaunchConfig, LegacyPinnedMemoryResource, diff --git a/cuda_core/examples/pytorch_example.py b/cuda_core/examples/pytorch_example.py index ea067302b9..433d63c9eb 100644 --- a/cuda_core/examples/pytorch_example.py +++ b/cuda_core/examples/pytorch_example.py @@ -15,7 +15,7 @@ import sys import torch -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # SAXPY kernel - passing a as a pointer to avoid any type issues code = """ diff --git a/cuda_core/examples/saxpy.py b/cuda_core/examples/saxpy.py index f38caef392..aa0d77eff9 100644 --- a/cuda_core/examples/saxpy.py +++ b/cuda_core/examples/saxpy.py @@ -14,7 +14,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # compute out = a * x + y code = """ diff --git a/cuda_core/examples/show_device_properties.py b/cuda_core/examples/show_device_properties.py index 1609d8c230..8b14cf0767 100644 --- a/cuda_core/examples/show_device_properties.py +++ b/cuda_core/examples/show_device_properties.py @@ -11,7 +11,7 @@ import sys -from cuda.core.experimental import Device, system +from cuda.core import Device, system # Convert boolean to YES or NO string diff --git a/cuda_core/examples/simple_multi_gpu_example.py b/cuda_core/examples/simple_multi_gpu_example.py index ec997a649b..438a21c808 100644 --- a/cuda_core/examples/simple_multi_gpu_example.py +++ b/cuda_core/examples/simple_multi_gpu_example.py @@ -12,7 +12,7 @@ import sys import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, launch, system +from cuda.core import Device, LaunchConfig, Program, launch, system if system.get_num_devices() < 2: print("this example requires at least 2 GPUs", file=sys.stderr) diff --git a/cuda_core/examples/strided_memory_view_cpu.py b/cuda_core/examples/strided_memory_view_cpu.py index de6007fd26..a20377cc76 100644 --- a/cuda_core/examples/strided_memory_view_cpu.py +++ 
b/cuda_core/examples/strided_memory_view_cpu.py @@ -26,7 +26,7 @@ print("cffi is not installed, the CPU example will be skipped", file=sys.stderr) FFI = None import numpy as np -from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory +from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py index 3e456776a8..e91ddc25cc 100644 --- a/cuda_core/examples/strided_memory_view_gpu.py +++ b/cuda_core/examples/strided_memory_view_gpu.py @@ -23,8 +23,8 @@ print("cupy is not installed, the GPU example will be skipped", file=sys.stderr) cp = None import numpy as np -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch -from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory # ################################################################################ # diff --git a/cuda_core/examples/thread_block_cluster.py b/cuda_core/examples/thread_block_cluster.py index e14158f8bd..f1ea8b8579 100644 --- a/cuda_core/examples/thread_block_cluster.py +++ b/cuda_core/examples/thread_block_cluster.py @@ -13,7 +13,7 @@ import sys import numpy as np -from cuda.core.experimental import ( +from cuda.core import ( Device, LaunchConfig, LegacyPinnedMemoryResource, diff --git a/cuda_core/examples/vector_add.py b/cuda_core/examples/vector_add.py index 2851303c7e..d31ab77208 100644 --- a/cuda_core/examples/vector_add.py +++ b/cuda_core/examples/vector_add.py @@ -10,7 +10,7 @@ # ################################################################################ import cupy as cp -from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch +from cuda.core import Device, LaunchConfig, Program, ProgramOptions, launch # compute c = a + b code = """ diff --git a/cuda_core/pixi.lock b/cuda_core/pixi.lock index 5f7368f8f7..16a0d2460f 100644 --- a/cuda_core/pixi.lock +++ b/cuda_core/pixi.lock @@ -1068,7 +1068,7 @@ packages: - cuda-cudart >=13.1.80,<14.0a0 license: Apache-2.0 input: - hash: 34cc0e9528da3d29832a101ecb88d7268d870dcc8b47dd880a3df12d7244e4a0 + hash: cccb645b22f775570680f1a9a62e415a09774e46645523bbd147226681155628 globs: - pyproject.toml - conda: . @@ -1088,7 +1088,7 @@ packages: - python_abi 3.14.* *_cp314 license: Apache-2.0 input: - hash: 34cc0e9528da3d29832a101ecb88d7268d870dcc8b47dd880a3df12d7244e4a0 + hash: cccb645b22f775570680f1a9a62e415a09774e46645523bbd147226681155628 globs: - pyproject.toml - conda: . 
@@ -1110,7 +1110,7 @@ packages: - cuda-cudart >=13.1.80,<14.0a0 license: Apache-2.0 input: - hash: 34cc0e9528da3d29832a101ecb88d7268d870dcc8b47dd880a3df12d7244e4a0 + hash: cccb645b22f775570680f1a9a62e415a09774e46645523bbd147226681155628 globs: - pyproject.toml - conda: https://conda.anaconda.org/conda-forge/noarch/cuda-crt-dev_linux-64-12.9.86-ha770c72_2.conda diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index af99ddd361..94a9e931cc 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -69,7 +69,7 @@ issues = "https://github.com/NVIDIA/cuda-python/issues/" include = ["cuda.core*"] [tool.setuptools.package-data] -"cuda.core.experimental.include" = ["*.h", "*.hpp", "*.cuh"] +"cuda.core._include" = ["*.h", "*.hpp", "*.cuh"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 9aaf23498f..114e9af296 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -13,6 +13,8 @@ from cuda import cuda as driver import cuda.core.experimental +from cuda.core import _device +from cuda.core._utils.cuda_utils import handle_return from cuda.core.experimental import ( Device, DeviceMemoryResource, @@ -21,9 +23,7 @@ ManagedMemoryResourceOptions, PinnedMemoryResource, PinnedMemoryResourceOptions, - _device, ) -from cuda.core.experimental._utils.cuda_utils import handle_return def skip_if_pinned_memory_unsupported(device): diff --git a/cuda_core/tests/helpers/buffers.py b/cuda_core/tests/helpers/buffers.py index b4d769eab3..eb02ea9c8b 100644 --- a/cuda_core/tests/helpers/buffers.py +++ b/cuda_core/tests/helpers/buffers.py @@ -3,8 +3,8 @@ import ctypes +from cuda.core._utils.cuda_utils import driver, handle_return from cuda.core.experimental import Buffer, Device, MemoryResource -from cuda.core.experimental._utils.cuda_utils import driver, handle_return from . import libc diff --git a/cuda_core/tests/memory_ipc/test_errors.py b/cuda_core/tests/memory_ipc/test_errors.py index d6280ae0ec..0d847c914f 100644 --- a/cuda_core/tests/memory_ipc/test_errors.py +++ b/cuda_core/tests/memory_ipc/test_errors.py @@ -5,8 +5,8 @@ import pickle import re +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import Buffer, Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError CHILD_TIMEOUT_SEC = 20 NBYTES = 64 diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py b/cuda_core/tests/memory_ipc/test_peer_access.py index 87dc459ffc..aa18e4059e 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -4,8 +4,8 @@ import multiprocessing as mp import pytest +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen CHILD_TIMEOUT_SEC = 20 diff --git a/cuda_core/tests/test_comparable.py b/cuda_core/tests/test_comparable.py index c99963cd23..84f440ae99 100644 --- a/cuda_core/tests/test_comparable.py +++ b/cuda_core/tests/test_comparable.py @@ -8,10 +8,10 @@ across Device, Stream, Event, and Context objects. 
""" +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import StreamOptions from cuda.core.experimental import Device, Stream -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import StreamOptions # ============================================================================ # Equality Contract Tests diff --git a/cuda_core/tests/test_context.py b/cuda_core/tests/test_context.py index 4fe35dc18d..133ebb4a5d 100644 --- a/cuda_core/tests/test_context.py +++ b/cuda_core/tests/test_context.py @@ -8,7 +8,7 @@ def test_context_init_disabled(): with pytest.raises(RuntimeError, match=r"^Context objects cannot be instantiated directly\."): - cuda.core.experimental._context.Context() # Ensure back door is locked. + cuda.core._context.Context() # Ensure back door is locked. # ============================================================================ diff --git a/cuda_core/tests/test_cuda_utils.py b/cuda_core/tests/test_cuda_utils.py index b0a0518652..c68f8fb841 100644 --- a/cuda_core/tests/test_cuda_utils.py +++ b/cuda_core/tests/test_cuda_utils.py @@ -4,7 +4,7 @@ import pytest from cuda.bindings import driver, runtime -from cuda.core.experimental._utils import cuda_utils +from cuda.core._utils import cuda_utils def test_driver_cu_result_explanations_health(): diff --git a/cuda_core/tests/test_device.py b/cuda_core/tests/test_device.py index ebdc3e3ac4..5e524d3d63 100644 --- a/cuda_core/tests/test_device.py +++ b/cuda_core/tests/test_device.py @@ -8,13 +8,13 @@ from cuda import cudart as runtime import cuda.core.experimental import pytest +from cuda.core._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return from cuda.core.experimental import Device -from cuda.core.experimental._utils.cuda_utils import ComputeCapability, get_binding_version, handle_return def test_device_init_disabled(): with pytest.raises(RuntimeError, match=r"^DeviceProperties cannot be instantiated directly\."): - cuda.core.experimental._device.DeviceProperties() # Ensure back door is locked. + cuda.core._device.DeviceProperties() # Ensure back door is locked. @pytest.fixture(scope="module") diff --git a/cuda_core/tests/test_event.py b/cuda_core/tests/test_event.py index ec35448619..a99a1448d5 100644 --- a/cuda_core/tests/test_event.py +++ b/cuda_core/tests/test_event.py @@ -17,7 +17,7 @@ def test_event_init_disabled(): with pytest.raises(RuntimeError, match=r"^Event objects cannot be instantiated directly\."): - cuda.core.experimental._event.Event() # Ensure back door is locked. + cuda.core._event.Event() # Ensure back door is locked. def test_timing_success(init_cuda): diff --git a/cuda_core/tests/test_graph.py b/cuda_core/tests/test_graph.py index e988eeebf6..77a2ee2489 100644 --- a/cuda_core/tests/test_graph.py +++ b/cuda_core/tests/test_graph.py @@ -11,6 +11,7 @@ from cuda.bindings import nvrtc except ImportError: from cuda import nvrtc +from cuda.core._utils.cuda_utils import NVRTCError, handle_return from cuda.core.experimental import ( Device, GraphBuilder, @@ -22,7 +23,6 @@ ProgramOptions, launch, ) -from cuda.core.experimental._utils.cuda_utils import NVRTCError, handle_return def _common_kernels(): diff --git a/cuda_core/tests/test_hashable.py b/cuda_core/tests/test_hashable.py index 4aa801866f..28a00605ce 100644 --- a/cuda_core/tests/test_hashable.py +++ b/cuda_core/tests/test_hashable.py @@ -12,10 +12,10 @@ 5. 
Hash/equality contract compliance (if a == b, then hash(a) must equal hash(b)) """ +from cuda.core._context import Context +from cuda.core._event import Event, EventOptions +from cuda.core._stream import Stream, StreamOptions from cuda.core.experimental import Device -from cuda.core.experimental._context import Context -from cuda.core.experimental._event import Event, EventOptions -from cuda.core.experimental._stream import Stream, StreamOptions # ============================================================================ # Integration Tests diff --git a/cuda_core/tests/test_launcher.py b/cuda_core/tests/test_launcher.py index d2e0a89a28..405e27c111 100644 --- a/cuda_core/tests/test_launcher.py +++ b/cuda_core/tests/test_launcher.py @@ -12,6 +12,8 @@ cp = None import numpy as np import pytest +from cuda.core._memory import _SynchronousMemoryResource +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import ( Device, DeviceMemoryResource, @@ -21,8 +23,6 @@ ProgramOptions, launch, ) -from cuda.core.experimental._memory import _SynchronousMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError from conftest import skipif_need_cuda_headers @@ -95,7 +95,7 @@ def test_launch_config_cluster_grid_conversion(init_cuda): def test_launch_config_native_conversion(init_cuda): """Test that _to_native_launch_config correctly converts grid from cluster units to block units.""" - from cuda.core.experimental._launch_config import _to_native_launch_config + from cuda.core._launch_config import _to_native_launch_config try: # Test case 1: 1D - Issue #867 example @@ -264,7 +264,7 @@ def test_cooperative_launch(): # # Commented out as this seems to be a sticky error... # config = LaunchConfig(grid=1, block=1) # launch(s, config, ker) - # from cuda.core.experimental._utils.cuda_utils import CUDAError + # from cuda.core._utils.cuda_utils import CUDAError # with pytest.raises(CUDAError) as e: # s.sync() # assert "CUDA_ERROR_LAUNCH_FAILED" in str(e) diff --git a/cuda_core/tests/test_linker.py b/cuda_core/tests/test_linker.py index b7af4b6ab7..b05aa7586b 100644 --- a/cuda_core/tests/test_linker.py +++ b/cuda_core/tests/test_linker.py @@ -3,9 +3,10 @@ # SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE import pytest -from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions, _linker -from cuda.core.experimental._module import ObjectCode -from cuda.core.experimental._utils.cuda_utils import CUDAError +from cuda.core import _linker +from cuda.core._module import ObjectCode +from cuda.core._utils.cuda_utils import CUDAError +from cuda.core.experimental import Device, Linker, LinkerOptions, Program, ProgramOptions ARCH = "sm_" + "".join(f"{i}" for i in Device().compute_capability) diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 23572014bb..2ff844ef93 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -17,6 +17,9 @@ import re import pytest +from cuda.core._dlpack import DLDeviceType +from cuda.core._memory import IPCBufferDescriptor +from cuda.core._utils.cuda_utils import CUDAError, handle_return from cuda.core.experimental import ( Buffer, Device, @@ -34,9 +37,6 @@ from cuda.core.experimental import ( system as ccx_system, ) -from cuda.core.experimental._dlpack import DLDeviceType -from cuda.core.experimental._memory import IPCBufferDescriptor -from cuda.core.experimental._utils.cuda_utils import CUDAError, handle_return from cuda.core.experimental.utils import 
StridedMemoryView from helpers import IS_WINDOWS from helpers.buffers import DummyUnifiedMemoryResource @@ -149,7 +149,7 @@ def test_package_contents(): "VirtualMemoryResource", ] d = {} - exec("from cuda.core.experimental._memory import *", d) # noqa: S102 + exec("from cuda.core._memory import *", d) # noqa: S102 d = {k: v for k, v in d.items() if not k.startswith("__")} assert sorted(expected) == sorted(d.keys()) diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index d17cdfd089..4067eb857a 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -3,8 +3,8 @@ import cuda.core.experimental import pytest +from cuda.core._utils.cuda_utils import CUDAError from cuda.core.experimental import DeviceMemoryResource -from cuda.core.experimental._utils.cuda_utils import CUDAError from helpers.buffers import PatternGen, compare_buffer_to_constant, make_scratch_buffer NBYTES = 1024 diff --git a/cuda_core/tests/test_module.py b/cuda_core/tests/test_module.py index 25b8d5dd86..041fe2f8cc 100644 --- a/cuda_core/tests/test_module.py +++ b/cuda_core/tests/test_module.py @@ -7,8 +7,8 @@ import cuda.core.experimental import pytest +from cuda.core._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return from cuda.core.experimental import Device, ObjectCode, Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, get_binding_version, handle_return try: import numba @@ -41,17 +41,17 @@ def cuda12_4_prerequisite_check(): def test_kernel_attributes_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelAttributes cannot be instantiated directly\."): - cuda.core.experimental._module.KernelAttributes() # Ensure back door is locked. + cuda.core._module.KernelAttributes() # Ensure back door is locked. def test_kernel_occupancy_init_disabled(): with pytest.raises(RuntimeError, match=r"^KernelOccupancy cannot be instantiated directly\."): - cuda.core.experimental._module.KernelOccupancy() # Ensure back door is locked. + cuda.core._module.KernelOccupancy() # Ensure back door is locked. def test_kernel_init_disabled(): with pytest.raises(RuntimeError, match=r"^Kernel objects cannot be instantiated directly\."): - cuda.core.experimental._module.Kernel() # Ensure back door is locked. + cuda.core._module.Kernel() # Ensure back door is locked. 
def test_object_code_init_disabled(): diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py index 8b490af233..0743b7f71d 100644 --- a/cuda_core/tests/test_multiprocessing_warning.py +++ b/cuda_core/tests/test_multiprocessing_warning.py @@ -12,11 +12,11 @@ import warnings from unittest.mock import patch +from cuda.core._event import _reduce_event +from cuda.core._memory._device_memory_resource import _deep_reduce_device_memory_resource +from cuda.core._memory._ipc import _reduce_allocation_handle +from cuda.core._utils.cuda_utils import reset_fork_warning from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions -from cuda.core.experimental._event import _reduce_event -from cuda.core.experimental._memory._device_memory_resource import _deep_reduce_device_memory_resource -from cuda.core.experimental._memory._ipc import _reduce_allocation_handle -from cuda.core.experimental._utils.cuda_utils import reset_fork_warning def test_warn_on_fork_method_device_memory_resource(ipc_device): diff --git a/cuda_core/tests/test_program.py b/cuda_core/tests/test_program.py index f432e3f88d..9a9e4926ae 100644 --- a/cuda_core/tests/test_program.py +++ b/cuda_core/tests/test_program.py @@ -6,11 +6,11 @@ import warnings import pytest -from cuda.core.experimental import _linker -from cuda.core.experimental._device import Device -from cuda.core.experimental._module import Kernel, ObjectCode -from cuda.core.experimental._program import Program, ProgramOptions -from cuda.core.experimental._utils.cuda_utils import CUDAError, driver, handle_return +from cuda.core import _linker +from cuda.core._device import Device +from cuda.core._module import Kernel, ObjectCode +from cuda.core._program import Program, ProgramOptions +from cuda.core._utils.cuda_utils import CUDAError, driver, handle_return cuda_driver_version = handle_return(driver.cuDriverGetVersion()) is_culink_backend = _linker._decide_nvjitlink_or_driver() @@ -19,7 +19,7 @@ def _is_nvvm_available(): """Check if NVVM is available.""" try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module _get_nvvm_module() return True @@ -32,7 +32,7 @@ def _is_nvvm_available(): ) try: - from cuda.core.experimental._utils.cuda_utils import driver, handle_return, nvrtc + from cuda.core._utils.cuda_utils import driver, handle_return, nvrtc _cuda_driver_version = handle_return(driver.cuDriverGetVersion()) except Exception: @@ -92,7 +92,7 @@ def _get_libnvvm_version_for_tests(): _libnvvm_version_attempted = True try: - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() @@ -140,7 +140,7 @@ def nvvm_ir(): fallback assumes no version metadata will be present in the input nvvm ir """ - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() major, minor, debug_major, debug_minor = nvvm.ir_version() @@ -415,7 +415,7 @@ def test_program_close(): @nvvm_available def test_nvvm_deferred_import(): """Test that our deferred NVVM import works correctly""" - from cuda.core.experimental._program import _get_nvvm_module + from cuda.core._program import _get_nvvm_module nvvm = _get_nvvm_module() assert nvvm is not None diff --git a/cuda_core/tests/test_stream.py b/cuda_core/tests/test_stream.py index 695a70e931..f05a7d3b3a 100644 --- a/cuda_core/tests/test_stream.py +++ 
b/cuda_core/tests/test_stream.py @@ -2,10 +2,10 @@ # SPDX-License-Identifier: Apache-2.0 import pytest +from cuda.core._event import Event +from cuda.core._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM +from cuda.core._utils.cuda_utils import driver from cuda.core.experimental import Device, Stream, StreamOptions -from cuda.core.experimental._event import Event -from cuda.core.experimental._stream import LEGACY_DEFAULT_STREAM, PER_THREAD_DEFAULT_STREAM -from cuda.core.experimental._utils.cuda_utils import driver from helpers.misc import StreamWrapper diff --git a/cuda_core/tests/test_strided_layout.py b/cuda_core/tests/test_strided_layout.py index c615365cf8..d897f78cfe 100644 --- a/cuda_core/tests/test_strided_layout.py +++ b/cuda_core/tests/test_strided_layout.py @@ -9,7 +9,7 @@ import numpy as np import pytest -from cuda.core.experimental._layout import _StridedLayout +from cuda.core._layout import _StridedLayout from helpers.layout import ( DenseOrder, LayoutSpec, diff --git a/cuda_core/tests/test_system.py b/cuda_core/tests/test_system.py index da81bbec99..d52629ded7 100644 --- a/cuda_core/tests/test_system.py +++ b/cuda_core/tests/test_system.py @@ -7,8 +7,8 @@ from cuda import cuda as driver from cuda import cudart as runtime +from cuda.core._utils.cuda_utils import handle_return from cuda.core.experimental import Device, system -from cuda.core.experimental._utils.cuda_utils import handle_return def test_system_singleton(): diff --git a/cuda_core/tests/test_utils.py b/cuda_core/tests/test_utils.py index 8bb66ef60d..d8c747bdb5 100644 --- a/cuda_core/tests/test_utils.py +++ b/cuda_core/tests/test_utils.py @@ -15,13 +15,13 @@ import cuda.core.experimental import numpy as np import pytest +from cuda.core._layout import _StridedLayout from cuda.core.experimental import Device -from cuda.core.experimental._layout import _StridedLayout from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory def test_cast_to_3_tuple_success(): - c3t = cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple + c3t = cuda.core._utils.cuda_utils.cast_to_3_tuple assert c3t("", ()) == (1, 1, 1) assert c3t("", 2) == (2, 1, 1) assert c3t("", (2,)) == (2, 1, 1) @@ -45,7 +45,7 @@ def test_cast_to_3_tuple_success(): ) def test_cast_to_3_tuple_value_error(cfg, expected): with pytest.raises(ValueError, match=expected): - cuda.core.experimental._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) + cuda.core._utils.cuda_utils.cast_to_3_tuple("Lbl", cfg) def convert_strides_to_counts(strides, itemsize): diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py b/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py index a661b4f1aa..e7829df406 100644 --- a/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py +++ b/cuda_python_test_helpers/cuda_python_test_helpers/__init__.py @@ -9,7 +9,7 @@ from contextlib import suppress from typing import Union -from cuda.core.experimental._utils.cuda_utils import handle_return +from cuda.core._utils.cuda_utils import handle_return __all__ = [ "IS_WINDOWS",
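For anyone trying the migration this patch introduces, the sketch below shows the intended usage end to end: the new `cuda.core` import paths, the renamed `cuda.core.utils` module, and the one-time `DeprecationWarning` emitted by the `cuda.core.experimental` forwarding stub. This is an illustrative sketch, not part of the patch; it assumes a `cuda.core` build from this branch is installed, and that `cuda.core` re-exports `Device` from `cuda.core._device` (as the tests above suggest), so the legacy name resolves to the same class.

```python
# Migration sketch (assumes a cuda.core build from this branch is installed).
import warnings

# New, preferred import paths introduced by this patch:
from cuda.core import Device
from cuda.core.utils import StridedMemoryView, args_viewable_as_strided_memory  # noqa: F401

# The legacy namespace still resolves, but its *first* import in a process
# emits a DeprecationWarning pointing at the new locations.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always", DeprecationWarning)
    from cuda.core.experimental import Device as LegacyDevice

# Expected: the stub forwards to the same class object, so existing code
# that mixes old and new imports keeps working.
assert LegacyDevice is Device

# The warning list is empty if cuda.core.experimental was already imported
# earlier in this process; subsequent attribute accesses stay silent.
for w in caught:
    if issubclass(w.category, DeprecationWarning):
        print(w.message)
```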