diff --git a/cuda_core/cuda/core/experimental/_layout.pxd b/cuda_core/cuda/core/experimental/_layout.pxd index 301cdaaa65..1596a96af3 100644 --- a/cuda_core/cuda/core/experimental/_layout.pxd +++ b/cuda_core/cuda/core/experimental/_layout.pxd @@ -170,6 +170,12 @@ cdef class StridedLayout: _init_base_layout_from_tuple(base, shape, None) return self._init_dense(base, itemsize, order_flag, &stride_order_vec) + cdef inline StridedLayout copy(StridedLayout self): + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + new_layout.init_from_ptr(self.base.ndim, self.base.shape, self.base.strides, self.itemsize) + new_layout.slice_offset = self.slice_offset + return new_layout + # ============================== # Properties # ============================== @@ -318,6 +324,18 @@ cdef class StridedLayout: # Layout manipulation # ============================== + cdef inline int make_dense(StridedLayout self, OrderFlag order_flag, axis_vec_t *stride_order_vec) except -1 nogil: + if order_flag == ORDER_C: + _dense_strides_c(self.base) + elif order_flag == ORDER_F: + _dense_strides_f(self.base) + elif order_flag == ORDER_PERM: + _dense_strides_in_order(self.base, deref(stride_order_vec)) + else: + raise ValueError("The stride_order must be 'C', 'F', or a permutation.") + self.slice_offset = 0 + self._prop_mask = 0 + return 0 cdef int reshape_into(StridedLayout self, StridedLayout out_layout, BaseLayout& new_shape) except -1 nogil cdef int permute_into(StridedLayout self, StridedLayout out_layout, axis_vec_t& axis_order) except -1 nogil @@ -375,21 +393,32 @@ cdef inline stride_t *get_strides_ptr(BaseLayout& base) except? NULL nogil: return tmp_strides -cdef inline bint _base_layout_equal(BaseLayout& a, BaseLayout& b) noexcept nogil: +cdef inline bint base_equal_shapes(BaseLayout& a, BaseLayout& b) noexcept nogil: if a.ndim != b.ndim: return False for i in range(a.ndim): if a.shape[i] != b.shape[i]: return False - if a.strides != NULL or b.strides != NULL: - if a.strides == NULL or b.strides == NULL: + return True + + +cdef inline bint _base_equal_strides(BaseLayout& a, BaseLayout& b) noexcept nogil: + if a.strides == NULL or b.strides == NULL: + return a.strides == b.strides + for i in range(a.ndim): + if a.strides[i] != b.strides[i]: return False - for i in range(a.ndim): - if a.strides[i] != b.strides[i]: - return False return True +cdef inline bint base_equal_strides(BaseLayout& a, BaseLayout& b) noexcept nogil: + return a.ndim == b.ndim and _base_equal_strides(a, b) + + +cdef inline bint base_layout_equal(BaseLayout& a, BaseLayout& b) noexcept nogil: + return base_equal_shapes(a, b) and _base_equal_strides(a, b) + + @cython.overflowcheck(True) cdef inline int64_t _volume(BaseLayout& base) except? 
-1 nogil: cdef int64_t vol = 1 diff --git a/cuda_core/cuda/core/experimental/_layout.pyx b/cuda_core/cuda/core/experimental/_layout.pyx index 65efd4025d..adfc1662a9 100644 --- a/cuda_core/cuda/core/experimental/_layout.pyx +++ b/cuda_core/cuda/core/experimental/_layout.pyx @@ -177,7 +177,7 @@ cdef class StridedLayout: ) def __eq__(self : StridedLayout, other : StridedLayout) -> bool: - return self.itemsize == other.itemsize and self.slice_offset == other.slice_offset and _base_layout_equal(self.base, other.base) + return self.itemsize == other.itemsize and self.slice_offset == other.slice_offset and base_layout_equal(self.base, other.base) @property def ndim(self : StridedLayout): diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index dc972d912a..b8118a45d4 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -5,7 +5,11 @@ from ._dlpack cimport * from libc.stdint cimport intptr_t from cuda.core.experimental._layout cimport StridedLayout -from cuda.core.experimental._stream import Stream +from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._strided_copy._copy cimport copy_into_d2d, copy_into_d2h, copy_into_h2d +from cuda.core.experimental._strided_copy._copy_utils cimport get_data_ptr +from cuda.core.experimental._stream cimport Stream +from cuda.core.experimental._strided_copy._copy import CopyAllocatorOptions import functools import warnings @@ -15,9 +19,6 @@ import numpy from cuda.core.experimental._utils.cuda_utils import handle_return, driver - -from cuda.core.experimental._memory import Buffer - # TODO(leofang): support NumPy structured dtypes @@ -160,7 +161,7 @@ cdef class StridedMemoryView: @classmethod def from_buffer( - cls, buffer : Buffer, layout : StridedLayout, + cls, Buffer buffer, StridedLayout layout, dtype : numpy.dtype | None = None, is_readonly : bool = False ) -> StridedMemoryView: @@ -218,7 +219,7 @@ cdef class StridedMemoryView: dlm_tensor.deleter(dlm_tensor) def view( - self, layout : StridedLayout | None = None, dtype : numpy.dtype | None = None + self, StridedLayout layout = None, object dtype = None ) -> StridedMemoryView: """ Creates a new view with adjusted layout and dtype. @@ -236,7 +237,7 @@ cdef class StridedMemoryView: def copy_from( self, other : StridedMemoryView, stream : Stream, - allocator = None, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None = None, blocking : bool | None = None, ): """ @@ -253,16 +254,19 @@ cdef class StridedMemoryView: if :attr:`dtype` is not specified). * The destination's layout must be unique (see :meth:`StridedLayout.is_unique`). + It is the user's responsibility to ensure the proper current device is set + when calling this method. + Parameters ---------- other : StridedMemoryView The view to copy data from. stream : Stream | None, optional The stream to schedule the copy on. - allocator : MemoryResource | None, optional - If temporary buffers are needed, the specifed memory resources - will be used to allocate the memory. If not specified, default - resources will be used. + allocator : :obj:`~utils.CopyAllocatorOptions` | dict[str, :obj:`~_memory.MemoryResource`] | None, optional + If temporary buffers are needed, the specified ``allocator.host`` and + ``allocator.device`` memory resources will be used to allocate the memory + on the host and device, respectively.
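+            For illustration (the resource objects here are placeholders),
+            both of the following forms are accepted::
+
+                CopyAllocatorOptions(host=pinned_host_mr, device=device_mr)
+                {"host": pinned_host_mr, "device": device_mr}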
blocking : bool | None, optional Whether the call should block until the copy is complete. * ``True``: the ``stream`` is synchronized with the host at the end of the call, @@ -274,11 +278,11 @@ cdef class StridedMemoryView: * for device-to-device, it defaults to ``False`` (non-blocking), * for host-to-device or device-to-host, it defaults to ``True`` (blocking). """ - raise NotImplementedError("Sorry, not supported: copy_from") + copy_into(self, other, stream, allocator, blocking) def copy_to( - self, other : StridedMemoryView, stream : Stream | None = None, - allocator = None, + self, other : StridedMemoryView, Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None = None, blocking : bool | None = None, ): """ @@ -286,7 +290,7 @@ cdef class StridedMemoryView: For details, see :meth:`copy_from`. """ - raise NotImplementedError("Sorry, not supported: copy_to") + copy_into(other, self, stream, allocator, blocking) @property def layout(self) -> StridedLayout: @@ -342,7 +346,7 @@ cdef class StridedMemoryView: raise ValueError("Cannot infer layout from the exporting object") return self._layout - cdef inline object get_buffer(self): + cdef inline Buffer get_buffer(self): """ Returns Buffer instance with the underlying data. If the SMV was created from a Buffer, it will return the same Buffer instance. @@ -661,13 +665,9 @@ cdef StridedLayout layout_from_cai(object metadata): return layout -cdef inline intptr_t get_data_ptr(object buffer, StridedLayout layout) except? 0: - return (int(buffer.handle)) + layout.get_slice_offset_in_bytes() - - cdef inline int view_buffer_strided( StridedMemoryView view, - object buffer, + Buffer buffer, StridedLayout layout, object dtype, bint is_readonly, @@ -706,3 +706,54 @@ cdef inline int view_buffer_strided( view._layout = layout view._dtype = dtype return 0 + + +cdef int copy_into( + StridedMemoryView dst, + StridedMemoryView src, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None = None, + blocking : bool | None = None, +) except -1: + cdef object dst_dtype = dst.get_dtype() + cdef object src_dtype = src.get_dtype() + if dst_dtype is not None and src_dtype is not None and dst_dtype != src_dtype: + raise ValueError( + f"The destination and source dtypes must be the same, " + f"got {dst_dtype} and {src_dtype}." + ) + if dst.readonly: + raise ValueError("The destination view is readonly.") + cdef bint is_src_device_accessible = src.is_device_accessible + cdef bint is_dst_device_accessible = dst.is_device_accessible + cdef bint is_blocking + cdef int device_id + cdef Buffer dst_buffer = dst.get_buffer() + cdef Buffer src_buffer = src.get_buffer() + cdef StridedLayout dst_layout = dst.get_layout() + cdef StridedLayout src_layout = src.get_layout() + if is_src_device_accessible and is_dst_device_accessible: + device_id = dst.device_id + if src.device_id != device_id: + raise ValueError( + f"The destination and source views must be on the " + f"same device, got {device_id} and {src.device_id}." 
+ ) + is_blocking = blocking if blocking is not None else False + copy_into_d2d(dst_buffer, dst_layout, src_buffer, src_layout, device_id, stream, is_blocking) + return 0 + elif is_src_device_accessible: + device_id = src.device_id + is_blocking = blocking if blocking is not None else True + copy_into_d2h(dst_buffer, dst_layout, src_buffer, src_layout, device_id, stream, allocator, is_blocking) + return 0 + elif is_dst_device_accessible: + device_id = dst.device_id + is_blocking = blocking if blocking is not None else True + copy_into_h2d(dst_buffer, dst_layout, src_buffer, src_layout, device_id, stream, allocator, is_blocking) + return 0 + else: + raise ValueError( + "The host-to-host copy is not supported, " + "at least one of the views must be device-accessible." + ) diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_copy.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pxd new file mode 100644 index 0000000000..b2e945e4df --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pxd @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental._stream cimport Stream +from cuda.core.experimental._layout cimport StridedLayout +from cuda.core.experimental._memory._buffer cimport Buffer + +from cuda.core.experimental._memory import MemoryResource + + +cdef int copy_into_d2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, +) except -1 + + +cdef int copy_into_d2h( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1 + + +cdef int copy_into_h2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_copy.pyx b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pyx new file mode 100644 index 0000000000..736f89a715 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pyx @@ -0,0 +1,528 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cimport cython + +from libc.stdint cimport intptr_t, int64_t + +from cuda.core.experimental._layout cimport axis_vec_t, ORDER_C +from cuda.core.experimental._strided_copy._copy_utils cimport ( + logging_axis_order, logging_memcopy, logging_py_objs, memcpy_async, + volume_in_bytes, maybe_sync, get_data_ptr, + flatten_together, maybe_broadcast_src, + check_itemsize, _view_as_numpy, _view_as_strided, _np_dtype +) +from cuda.core.experimental._strided_copy._cuda_kernel cimport cuda_kernel_copy + +import contextlib +import threading +from dataclasses import dataclass +from cuda.core.experimental._device import Device +from cuda.core.experimental._memory import MemoryResource, DeviceMemoryResource +import numpy as _numpy + + +_thread_local = threading.local() +_current_logger = None + + +@contextlib.contextmanager +def _with_logger(logger): + # Utility meant for debugging and testing purposes. 
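+    # A minimal usage sketch (test/debug helper; the logger name and the views
+    # below are placeholders):
+    #
+    #     import logging
+    #     with _with_logger(logging.getLogger("cuda.core.strided_copy")):
+    #         dst_view.copy_from(src_view, stream=stream)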
+ global _current_logger + _current_logger = logger + yield + _current_logger = None + + +@dataclass +class CopyAllocatorOptions: + host : MemoryResource | None = None + device : DeviceMemoryResource | None = None + + +cdef inline allocator_options(allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None): + if allocator is None or type(allocator) is CopyAllocatorOptions: + return allocator + return CopyAllocatorOptions(**allocator) + + +cdef inline object _numpy_empty( + Buffer host_alloc, + StridedLayout layout, + object logger, +): + """ + The layout must be contiguous in some order (layout.get_is_contiguous_any() is True) + If host_alloc is not None, returns a numpy array being a view on the host_alloc with + shape, itemsize and strides as in the layout. + Otherwise, returns a new numpy array with shape, itemsize and strides as in the layout. + """ + if host_alloc is not None: + return _view_as_numpy(int(host_alloc.handle), volume_in_bytes(layout), layout) + cdef object a = _numpy.empty(layout.get_volume(), dtype=_np_dtype(layout.itemsize)) + return _view_as_strided(a, layout) + + +cdef inline object _numpy_ascontiguousarray( + Buffer host_alloc, + intptr_t data_ptr, + int64_t size, + StridedLayout layout, + object logger, +): + """ + Returns a numpy array with the same shape and itemsize as the layout, + but C-contiguous strides. The data_ptr must be a valid host pointer to + a tensor described with the layout. + The layout is modified in place so that it is C-contiguous and dense. + If host_alloc is provided, copies the data there and returns a view on it. + Otherwise, returns a new numpy array. + """ + cdef object a = _view_as_numpy(data_ptr, size, layout) + if logger is not None: + logger.debug( + f"({layout}) is not contiguous, coalescing H2H copy is needed." + ) + layout.make_dense(ORDER_C, NULL) + if host_alloc is None: + return _numpy.ascontiguousarray(a) + cdef object b = _numpy_empty(host_alloc, layout, logger) + _numpy.copyto(b, a) + return b + + +cdef inline get_device_default_mr(int device_id): + cdef dict device_default_mrs = getattr(_thread_local, "device_default_mrs", None) + if device_default_mrs is None: + device_default_mrs = {} + _thread_local.device_default_mrs = device_default_mrs + cdef mr = device_default_mrs.get(device_id) + if mr is not None: + return mr + + # We're accessing the device's default mr for the first time + # We need to make sure the device's context has ever been set + cdef object current_dev = Device() + cdef int current_dev_id = current_dev.device_id + cdef object dev = Device(device_id) + try: + dev.set_current() + mr = dev.memory_resource + device_default_mrs[device_id] = mr + return mr + finally: + if current_dev_id != device_id: + current_dev.set_current() + + +cdef inline Buffer _device_allocate( + device_allocator : DeviceMemoryResource | None, + int64_t size, + int device_id, + Stream stream, +): + if device_allocator is None: + device_allocator = get_device_default_mr(device_id) + return device_allocator.allocate(size, stream) + + +cdef inline int _copy_into_d2d( + intptr_t dst_ptr, + StridedLayout dst_layout, + intptr_t src_ptr, + StridedLayout src_layout, + int device_id, + intptr_t stream_ptr, + bint blocking, + object logger, +) except -1 nogil: + # Note: this function assumes that layouts were squeezed + # already and can be modified in place (i.e. they are not + # referenced elsewhere, e.g. by StridedMemoryView). 
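+    #
+    # In outline (matching the steps below): permute both layouts to the
+    # destination's stride order, flatten axes that are mergeable in both,
+    # then either issue a single cuMemcpyAsync (when both layouts end up
+    # C-contiguous) or fall back to the JIT-compiled strided-copy kernel.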
+ + cdef int64_t size = volume_in_bytes(dst_layout) + if size == 0: + return 0 + + # Normalize the layouts: + # 1. permute the layouts so that the dst layouts order is C-like. + # 2. flatten extents that are mergable in both layouts + cdef axis_vec_t axis_order + dst_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + flatten_together(dst_layout, src_layout) + if logger is not None: + logging_axis_order("The dst_order is {fst}", logger, axis_order) + logging_py_objs("Normalized layouts: dst {fst}, src {snd}", logger, dst_layout, src_layout) + + # Can we just memcpy? + # We permuted the extents to C-order, so if layouts are dense, they are C-contiguous. + # The precondition is that the shapes are equal, thus the c-contiguity implies equal strides. + if dst_layout.get_is_contiguous_c() and src_layout.get_is_contiguous_c(): + if logger is not None: + logging_memcopy( + "The layouts are contiguous and have the same stride order, we can memcopy and return.\n{msg}", + logger, "D2D", + dst_ptr, dst_layout, + src_ptr, src_layout, + blocking, size, stream_ptr + ) + memcpy_async(dst_ptr, src_ptr, size, stream_ptr) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + if not dst_layout.get_is_unique(): + raise ValueError( + f"The destination layout is non-unique, i.e. some elements " + f"may overlap in memory: {dst_layout}" + ) + cuda_kernel_copy(dst_ptr, dst_layout, src_ptr, src_layout, device_id, stream_ptr, logger) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + +cdef inline int _copy_into_h2d( + intptr_t dst_data_ptr, + StridedLayout dst_layout, + intptr_t src_data_ptr, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, + host_allocator : MemoryResource | None, + device_allocator : DeviceMemoryResource | None, + object logger, +) except -1: + """ + Copies data from host to device, rearranging the data if needed. + In a simplest case of contiguous layouts with matching stride orders, + only H2D memcpy is needed. + Otherwise, up to 2 extra copies (H2H and D2D) may be needed + for 3 different reasons: + * if src is non-contiguous we need a H2H 'coalescing' copy + * if dst is non-contiguous we need a D2D 'scattering' copy + * if the dst and src's stride orders differ, we need + a either H2H or D2D copy to transpose the data. + Moving data around in the device memory should be faster, + so we prefer D2D for transpose-copy, unless coalescing + H2H is needed anyway while D2D for scattering can be avoided. + """ + + # Note, this function assumes that layouts were squeezed + # already and can be modified in place (i.e. they are not + # referenced elsewhere, e.g. by StridedMemoryView). + + cdef intptr_t stream_ptr = stream._handle + cdef int64_t size = volume_in_bytes(dst_layout) + if size == 0: + return 0 + + # Permute the layouts so that src has C-like strides + # (increasing from the right to the left) + cdef axis_vec_t axis_order + src_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + + # First, make sure the src is contiguous, so that we can memcpy from it. 
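+    # For example (an illustrative case, not the only one): a host source that
+    # is a strided slice of a larger array, e.g. every other column, is not
+    # contiguous in any axis order, so it is packed into a C-contiguous host
+    # staging array here before the single H2D memcpy below.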
+ cdef Buffer host_alloc = None + if host_allocator is not None: + host_alloc = host_allocator.allocate(size, stream) + # A numpy array (either view on host_alloc or a new regular numpy array) + cdef object src_tmp = None + if not src_layout.get_is_contiguous_c(): + if dst_layout.get_is_contiguous_any(): + # We cannot avoid H2H copy, but we can avoid D2D copy + # if we also make sure that the src and dst stride orders match. + # We permute layouts to dst order, so that dst is C-contig + # and the src will be made C-contig by the H2H copy. + dst_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + # the host allocation may not be stream-aware (in particular, the default numpy is not!) + # so we need to block at least until the H2D is complete to avoid deallocating too early. + blocking = True + src_tmp = _numpy_ascontiguousarray(host_alloc, src_data_ptr, size, src_layout, logger) + src_data_ptr = src_tmp.ctypes.data + + if dst_layout.get_is_contiguous_c(): + # We made sure that src layout is C-contig too, we can just memcopy. + if logger is not None: + logging_memcopy( + "The layouts are contiguous and have the same stride order, we can memcopy and return.\n{msg}", + logger, "H2D", + dst_data_ptr, dst_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dst_data_ptr, src_data_ptr, size, stream_ptr) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + # Otherwise, either dst is not contiguous or src has a different stride order than dst. + # In either case, src is contiguous in some order, so we can just memcopy + # it to a temporary buffer and then perform a D2D transpose/scatter copy to dst. + cdef Buffer dev_tmp = _device_allocate(device_allocator, size, device_id, stream) + cdef intptr_t dev_tmp_data_ptr = int(dev_tmp.handle) + if logger is not None: + logging_memcopy( + f"First, memcpy into a temporary device buffer:\n{{msg}}\n" + f"Followed by scatter/transpose D2D copy:\n" + f"({dst_data_ptr}, {dst_layout} <- {dev_tmp_data_ptr}, {src_layout})", + logger, "H2D", + dev_tmp_data_ptr, src_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dev_tmp_data_ptr, src_data_ptr, size, stream_ptr) + _copy_into_d2d( + dst_data_ptr, dst_layout, + dev_tmp_data_ptr, src_layout, + device_id, stream_ptr, + blocking, logger + ) + return 0 + + +cdef inline int _copy_into_d2h( + intptr_t dst_data_ptr, + StridedLayout dst_layout, + intptr_t src_data_ptr, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, + host_allocator : MemoryResource | None, + device_allocator : DeviceMemoryResource | None, + object logger, +) except -1: + """ + Copies data from device to host, rearranging the data if needed. + In a simplest case of contiguous layouts with matching stride orders, + only D2H memcpy is needed. + Otherwise, up to 2 extra copies (D2D and H2H) may be needed: + * if src is non-contiguous we need a D2D 'coalescing' copy + * if dst is non-contiguous we need a H2H 'scattering' copy + * if the dst and src's stride orders differ, we need + a either D2D or H2H copy to transpose the data. + Moving data around in the device memory should be faster, + so we prefer D2D for transpose-copy, unless H2H is needed anyway + for scattering while D2D can be avoided. + """ + + # Note, this function assumes that layouts were squeezed + # already and can be modified in place (i.e. 
they are not + # referenced elsewhere, e.g. by StridedMemoryView). + + cdef intptr_t stream_ptr = stream._handle + cdef int64_t size = volume_in_bytes(dst_layout) + if size == 0: + return 0 + + if not dst_layout.get_is_unique(): + raise ValueError( + f"The destination layout is non-unique, i.e. some elements " + f"may overlap in memory: {dst_layout}" + ) + + # Permute the layouts so that dst has C-like strides + # (increasing from the right to the left) + cdef axis_vec_t axis_order + dst_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + + # If, after the permutation to C-like order, dst is still not C-contig, + # we need to H2H scatter the data. + cdef bint is_dst_contig = dst_layout.get_is_contiguous_c() + + cdef Buffer src_tmp = None + cdef intptr_t src_tmp_data_ptr + cdef StridedLayout src_tmp_layout + if ( + # if dst does not require scattering H2H copy, + # run D2D whenever src needs transposing or coalescing + (is_dst_contig and not src_layout.get_is_contiguous_c()) + # otherwise, as H2H is needed anyway, + # run D2D only if src requires coalescing + or not src_layout.get_is_contiguous_any() + ): + # After the copy, the src will be coalesced and + # transposed to dst order. + src_tmp_layout = src_layout.to_dense("C") + src_tmp = _device_allocate(device_allocator, size, device_id, stream) + src_tmp_data_ptr = int(src_tmp.handle) + if logger is not None: + logger.debug( + f"We need to coalesce or transpose the data, " + f"running D2D copy into a temporary device buffer:\n" + f"({src_tmp_data_ptr}, {src_tmp_layout} <- {src_data_ptr}, {src_layout})" + ) + _copy_into_d2d( + src_tmp_data_ptr, + # pass a copy of the dense layout, we'll + # need it later and d2d can modify it in place + src_tmp_layout.copy(), + src_data_ptr, src_layout, + device_id, stream_ptr, + False, logger + ) + src_data_ptr = src_tmp_data_ptr + src_layout = src_tmp_layout + + if is_dst_contig: + # The dst is c-contig. If we run a D2D copy, it made src C-contig too. + # If we didn't run a D2D copy, src must have been C-contig already. + if logger is not None: + logging_memcopy( + "The layouts are contiguous and have the same stride order, we can memcopy and return.\n{msg}", + logger, "D2H", + dst_data_ptr, dst_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dst_data_ptr, src_data_ptr, size, stream_ptr) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + # Otherwise, we need to D2H copy into a temp host buffer and run + # a H2H copy to scatter or transpose the data. 
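+    # For example (illustration only): when the host destination is itself a
+    # strided slice and so not contiguous in any order, the device data is
+    # first copied into a compact host staging buffer laid out like the
+    # (contiguous) source, and numpy.copyto below scatters it into the
+    # destination's strides.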
+ cdef Buffer host_alloc = None + if host_allocator is not None: + host_alloc = host_allocator.allocate(size, stream) + # A numpy array (either view on host_alloc or a new regular numpy array) + cdef object dst_tmp = _numpy_empty(host_alloc, src_layout, logger) + cdef intptr_t dst_tmp_data_ptr = dst_tmp.ctypes.data + if logger is not None: + logging_memcopy( + f"First memcpy into a temporary host buffer:\n{{msg}}\n" + f"Followed by scatter/transpose H2H copy:\n" + f"({dst_data_ptr}, {dst_layout} <- {dst_tmp_data_ptr}, {src_layout})", + logger, "D2H", + dst_data_ptr, src_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dst_tmp_data_ptr, src_data_ptr, size, stream_ptr) + maybe_sync(stream_ptr, True, logger) + _numpy.copyto(_view_as_numpy(dst_data_ptr, size, dst_layout), dst_tmp) + return 0 + + +cdef int copy_into_d2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, +) except -1: + check_itemsize(dst_layout, src_layout) + src_layout = maybe_broadcast_src(dst_layout, src_layout) + + cdef intptr_t stream_ptr = stream._handle + cdef intptr_t dst_data_ptr = get_data_ptr(dst_buffer, dst_layout) + cdef intptr_t src_data_ptr = get_data_ptr(src_buffer, src_layout) + + # Get rid of all 1-extents, as their strides are irrelevant. + # Make sure the copy of the layouts is passed, as the _copy_into_d2d + # may modify those + cdef StridedLayout squeezed_dst = StridedLayout.__new__(StridedLayout) + cdef StridedLayout squeezed_src = StridedLayout.__new__(StridedLayout) + dst_layout.squeeze_into(squeezed_dst) + src_layout.squeeze_into(squeezed_src) + + return _copy_into_d2d( + dst_data_ptr, squeezed_dst, + src_data_ptr, squeezed_src, + device_id, stream_ptr, + blocking, _current_logger + ) + + +cdef int copy_into_h2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1: + check_itemsize(dst_layout, src_layout) + src_layout = maybe_broadcast_src(dst_layout, src_layout) + + cdef intptr_t dst_data_ptr = get_data_ptr(dst_buffer, dst_layout) + cdef intptr_t src_data_ptr = get_data_ptr(src_buffer, src_layout) + cdef host_allocator = None + cdef device_allocator = None + allocator = allocator_options(allocator) + if allocator is not None: + host_allocator = allocator.host + device_allocator = allocator.device + + # Get rid of all 1-extents, as their strides are irrelevant. 
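+    # (e.g. a (3, 1, 4) layout squeezes to (3, 4); the stride of the dropped
+    # axis is irrelevant to the copy)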
+ # Make sure the copy of the layouts is passed, as the _copy_into_h2d + # may modify those + cdef StridedLayout squeezed_dst = StridedLayout.__new__(StridedLayout) + cdef StridedLayout squeezed_src = StridedLayout.__new__(StridedLayout) + dst_layout.squeeze_into(squeezed_dst) + src_layout.squeeze_into(squeezed_src) + + return _copy_into_h2d( + dst_data_ptr, squeezed_dst, + src_data_ptr, squeezed_src, + device_id, stream, + blocking, host_allocator, device_allocator, + _current_logger, + ) + + +cdef int copy_into_d2h( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1: + check_itemsize(dst_layout, src_layout) + src_layout = maybe_broadcast_src(dst_layout, src_layout) + + cdef intptr_t dst_data_ptr = get_data_ptr(dst_buffer, dst_layout) + cdef intptr_t src_data_ptr = get_data_ptr(src_buffer, src_layout) + cdef host_allocator = None + cdef device_allocator = None + allocator = allocator_options(allocator) + if allocator is not None: + host_allocator = allocator.host + device_allocator = allocator.device + + # Get rid of all 1-extents, as their strides are irrelevant. + # Make sure the copy of the layouts is passed, as the _copy_into_d2h + # may modify those + cdef StridedLayout squeezed_dst = StridedLayout.__new__(StridedLayout) + cdef StridedLayout squeezed_src = StridedLayout.__new__(StridedLayout) + dst_layout.squeeze_into(squeezed_dst) + src_layout.squeeze_into(squeezed_src) + + return _copy_into_d2h( + dst_data_ptr, squeezed_dst, + src_data_ptr, squeezed_src, + device_id, stream, + blocking, + host_allocator, device_allocator, + _current_logger + ) diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_copy_utils.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_copy_utils.pxd new file mode 100644 index 0000000000..102642bf08 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_copy_utils.pxd @@ -0,0 +1,167 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython + +cimport cpython +from cpython.memoryview cimport PyMemoryView_FromMemory + +from libc.stdint cimport intptr_t, int64_t + +from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._layout cimport ( + StridedLayout, BaseLayout, axis_vec_t, axes_mask_t, + init_base_layout, _overflow_checked_mul, base_equal_shapes +) +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + +from cuda.bindings cimport cydriver +from cuda.bindings.cydriver cimport CUstream, CUdeviceptr + +import numpy as _numpy + + +cdef inline int64_t div_ceil(int64_t a, int64_t b) except? -1 nogil: + return (a + b - 1) // b + + +cdef inline intptr_t get_data_ptr(Buffer buffer, StridedLayout layout) except? 0: + return (int(buffer.handle)) + layout.get_slice_offset_in_bytes() + + +cdef inline object _np_dtype(int itemsize): + if itemsize == 1: + return _numpy.uint8 + elif itemsize == 2: + return _numpy.uint16 + elif itemsize == 4: + return _numpy.uint32 + elif itemsize == 8: + return _numpy.uint64 + elif itemsize == 16: + return _numpy.complex128 + else: + raise ValueError(f"Unsupported itemsize: {itemsize}") + + +cdef inline _view_as_strided(object array, StridedLayout layout): + """ + Array must be a 1d numpy array with dtype.itemsize == layout.itemsize. 
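+    For example, a C-ordered (2, 3) layout with itemsize 4 is viewed with
+    shape (2, 3) and byte strides (12, 4).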
+ """ + cdef tuple strides = layout.get_strides_in_bytes_tuple() + if strides is None: + return array.reshape(layout.get_shape_tuple(), order='C') + else: + return _numpy.lib.stride_tricks.as_strided( + array, + shape=layout.get_shape_tuple(), + strides=strides + ) + + +cdef inline object _view_as_numpy(intptr_t data_ptr, int64_t size, StridedLayout layout): + """ + Note the returned array is non-owning, it's caller responsibility to keep buffer alive. + """ + cdef object buf = PyMemoryView_FromMemory(data_ptr, size, cpython.PyBUF_WRITE) + cdef object array = _numpy.frombuffer(buf, dtype=_np_dtype(layout.itemsize)) + return _view_as_strided(array, layout) + + +cdef inline int flatten_together(StridedLayout a, StridedLayout b) except -1 nogil: + cdef axes_mask_t axis_mask = a.get_flattened_axis_mask() & b.get_flattened_axis_mask() + a.flatten_into(a, axis_mask) + b.flatten_into(b, axis_mask) + return 0 + + +cdef inline bint vectorize_together(intptr_t dst_ptr, StridedLayout dst, intptr_t src_ptr, StridedLayout src) except -1 nogil: + cdef int max_itemsize = 8 + cdef int itemsize = dst.itemsize + if itemsize >= max_itemsize: + return False + cdef int new_itemsize = dst.get_max_compatible_itemsize(max_itemsize, dst_ptr) + if itemsize >= new_itemsize: + return False + new_itemsize = src.get_max_compatible_itemsize(new_itemsize, src_ptr) + if itemsize >= new_itemsize: + return False + dst.pack_into(dst, new_itemsize, 0, keep_dim=False) + src.pack_into(src, new_itemsize, 0, keep_dim=False) + return True + + +cdef inline int64_t volume_in_bytes(StridedLayout layout) except? -1 nogil: + return _overflow_checked_mul(layout.get_volume(), layout.itemsize) + + +cdef inline int check_itemsize(StridedLayout dst, StridedLayout src) except -1 nogil: + if dst.itemsize != src.itemsize: + raise ValueError( + f"The itemsize of the destination and source layouts must match. " + f"Got dst itemsize:{dst.itemsize} and src itemsize:{src.itemsize}" + ) + return 0 + + +cdef inline StridedLayout maybe_broadcast_src(StridedLayout dst, StridedLayout src): + if base_equal_shapes(dst.base, src.base): + return src + # If the shapes differ, try broadcasting the source layout to the destination layout. 
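+    # For example, a source of shape (1, 4) broadcasts against a destination
+    # of shape (3, 4), with the source repeated along its size-1 axis.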
+ cdef StridedLayout new_src = StridedLayout.__new__(StridedLayout) + cdef BaseLayout new_src_base + cdef int dst_ndim = dst.base.ndim + init_base_layout(new_src_base, dst_ndim) + for i in range(dst_ndim): + new_src_base.shape[i] = dst.base.shape[i] + src.broadcast_into(new_src, new_src_base) + return new_src + + +cdef inline int logging_axis_order(str msg, logger, axis_vec_t& fst) except -1 nogil: + with cython.gil: + logger.debug(msg.format(fst=fst)) + return 0 + + +cdef inline int logging_memcopy( + str msg, object logger, str kind, + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + bint blocking, int64_t size, intptr_t stream_ptr +) except -1 nogil: + with cython.gil: + logger.debug(msg.format(msg=( + f"Launching {kind} {'blocking' if blocking else 'non-blocking'} memcpy of " + f"{size} bytes on stream {stream_ptr}.\n" + f"Dst: {dst_ptr}, {dst} <- src: {src_ptr}, {src}" + ))) + return 0 + + +cdef inline int logging_py_objs(str msg, logger, fst=None, snd=None, third=None) except -1 nogil: + with cython.gil: + logger.debug(msg.format(fst=fst, snd=snd, third=third)) + return 0 + + +cdef inline int memcpy_async(intptr_t dst_ptr, intptr_t src_ptr, size_t size, intptr_t stream_ptr) except -1 nogil: + HANDLE_RETURN( + cydriver.cuMemcpyAsync( + dst_ptr, + src_ptr, + size, + stream_ptr + ) + ) + return 0 + + +cdef inline int maybe_sync(intptr_t stream_ptr, bint blocking, object logger) except -1 nogil: + if blocking: + if logger is not None: + with cython.gil: + logger.debug(f"Syncing stream {stream_ptr}.") + HANDLE_RETURN(cydriver.cuStreamSynchronize(stream_ptr)) + return 0 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pxd new file mode 100644 index 0000000000..a033007b75 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pxd @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython + +from libc.stdint cimport intptr_t + +from cuda.core.experimental._layout cimport StridedLayout + + +cdef int cuda_kernel_copy( + intptr_t dst_ptr, + StridedLayout dst, + intptr_t src_ptr, + StridedLayout src, + int device_id, + intptr_t stream_ptr, + object logger, +) except -1 nogil diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pyx b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pyx new file mode 100644 index 0000000000..93307c0985 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pyx @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython + +from libc.stdint cimport intptr_t, int64_t +from libcpp.memory cimport unique_ptr +from libcpp.functional cimport function + +from cuda.bindings cimport cydriver +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core.experimental._layout cimport StridedLayout, axes_mask_t, flatten_all_axes_mask, get_strides_ptr +from cuda.core.experimental._strided_copy._jit cimport get_kernel +from cuda.core.experimental._strided_copy._copy_utils cimport logging_py_objs, div_ceil, vectorize_together + +ctypedef unique_ptr[void, function[void(void*)]] opaque_args_t + + +cdef extern from "limits.h": + cdef int INT_MAX + cdef int INT_MIN + + +cdef extern from "include/strided_copy_utils.hpp": + void _get_strided_copy_args( + opaque_args_t& args, + void *dst_ptr, const void *src_ptr, + int dst_ndim, int src_ndim, + int64_t* dst_shape, int64_t* src_shape, + int64_t* dst_strides, int64_t* src_strides, + int64_t grid_arg) except + nogil + + +cdef inline int get_kernel_args( + opaque_args_t& args, + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + int64_t grid_arg +) except-1 nogil: + _get_strided_copy_args( + args, + dst_ptr, src_ptr, + dst.base.ndim, src.base.ndim, + dst.base.shape, src.base.shape, + get_strides_ptr(dst.base), get_strides_ptr(src.base), + grid_arg + ) + return 0 + + +cdef inline bint needs_wide_strides(int64_t grid_volume, StridedLayout dst, StridedLayout src) except?-1 nogil: + # grid_volume, i.e the block_size * num_blocks + if grid_volume > INT_MAX: + return True + cdef int64_t dst_min_offset = 0 + cdef int64_t dst_max_offset = 0 + cdef int64_t src_min_offset = 0 + cdef int64_t src_max_offset = 0 + dst.get_offset_bounds(dst_min_offset, dst_max_offset) + src.get_offset_bounds(src_min_offset, src_max_offset) + cdef int64_t min_offset = min(dst_min_offset, src_min_offset) + cdef int64_t max_offset = max(dst_max_offset, src_max_offset) + # forbid INT_MIN too so that: + # 1. abs() is safe + # 2. the INT_MIN can be used as special-value/sentinel in the kernel + return min_offset <= INT_MIN or max_offset > INT_MAX + + +cdef str emit_elementwise_kernel_code(StridedLayout dst, StridedLayout src, bint has_wide_strides, bint has_grid_stride_loop): + cdef str stride_t_str = "int64_t" if has_wide_strides else "int32_t" + cdef str has_grid_stride_loop_str = "true" if has_grid_stride_loop else "false" + kernel_code = f""" + #include "elementwise.h" + ELEMENTWISE_KERNEL({stride_t_str}, {dst.base.ndim}, {src.base.ndim}, {dst.itemsize}, {has_grid_stride_loop_str}) + """ + return kernel_code + + +cdef inline intptr_t get_elementwise_copy_kernel( + StridedLayout dst, StridedLayout src, + bint has_wide_strides, bint has_grid_stride_loop, + int device_id, object logger +) except? 0: + cdef str kernel_code = emit_elementwise_kernel_code(dst, src, has_wide_strides, has_grid_stride_loop) + cdef intptr_t kernel_ptr = get_kernel(kernel_code, device_id, logger) + return kernel_ptr + + +cdef inline int adjust_layouts_for_elementwise_copy(StridedLayout dst, StridedLayout src, object logger) except -1 nogil: + # We want the layouts to keep the same shapes, so that, in cuda kernel, + # we have to unravel flat element index only once. + # The exception is if one of the layouts is flattened to 1D, + # as those don't require unraveling. 
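+    # For example, when the destination is fully C-contiguous all of its axes
+    # can be merged into a single extent; the kernel then indexes it with the
+    # flat element index directly and only the source coordinates are
+    # unraveled.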
+ cdef int ndim = dst.base.ndim + if ndim == 1: + return 0 + cdef axes_mask_t all_extents = flatten_all_axes_mask(ndim) + cdef axes_mask_t dst_mask = dst.get_flattened_axis_mask() + cdef axes_mask_t src_mask = src.get_flattened_axis_mask() + if dst_mask == all_extents or src_mask == all_extents: + if dst_mask == all_extents: + dst.flatten_into(dst, dst_mask) + if src_mask == all_extents: + src.flatten_into(src, src_mask) + if logger is not None: + logging_py_objs("At least one of the layouts is flattened to 1D: dst {fst}, src {snd}", logger, dst, src) + return 0 + + +cdef int launch_elementwise_copy( + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + int block_size, int device_id, + intptr_t stream_ptr, object logger +) except -1 nogil: + cdef int64_t volume = dst.get_volume() + cdef int64_t num_logical_blocks = div_ceil(volume, block_size) + cdef int64_t cuda_num_blocks = min(num_logical_blocks, INT_MAX) + cdef bint has_grid_stride_loop = cuda_num_blocks != num_logical_blocks + cdef bint has_wide_strides = needs_wide_strides(num_logical_blocks * block_size, dst, src) + cdef opaque_args_t args + get_kernel_args(args, dst_ptr, dst, src_ptr, src, volume) + cdef void* args_ptr = args.get() + cdef intptr_t kernel_fn_ptr + with cython.gil: + kernel_fn_ptr = get_elementwise_copy_kernel(dst, src, has_wide_strides, has_grid_stride_loop, device_id, logger) + if logger is not None: + logger.debug( + f"Launching elementwise copy kernel {kernel_fn_ptr} " + f"with grid.x={cuda_num_blocks}, block.x={block_size}." + ) + HANDLE_RETURN(cydriver.cuLaunchKernel( + kernel_fn_ptr, + cuda_num_blocks, 1, 1, + block_size, 1, 1, + 0, # shared_mem_size + stream_ptr, + &args_ptr, + NULL + )) + return 0 + + +cdef inline int elementwise_copy( + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + int device_id, intptr_t stream_ptr, object logger +) except -1 nogil: + cdef int block_size = 128 + adjust_layouts_for_elementwise_copy(dst, src, logger) + launch_elementwise_copy(dst_ptr, dst, src_ptr, src, block_size, device_id, stream_ptr, logger) + return 0 + + +cdef int cuda_kernel_copy( + intptr_t dst_ptr, + StridedLayout dst, + intptr_t src_ptr, + StridedLayout src, + int device_id, + intptr_t stream_ptr, + object logger, +) except -1 nogil: + # the dst and layouts must be already validated and normalized, i.e.: + # * the shapes must be equal + # * the dst stride order must be C-like + # * implicit C-strides are not allowed (i.e. the strides must not be NULL) + # * the volume should be >= 2 + # * there should not be any extents equal to 1 + # * the layouts should be flattened together + if vectorize_together(dst_ptr, dst, src_ptr, src) and logger is not None: + logging_py_objs("Vectorized the layouts: dst {fst}, src {snd}", logger, dst, src) + elementwise_copy(dst_ptr, dst, src_ptr, src, device_id, stream_ptr, logger) + return 0 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_jit.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pxd new file mode 100644 index 0000000000..e8940d8dec --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pxd @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t + +cdef intptr_t get_kernel(str kernel_code, int device_id, object logger) except? 
0 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_jit.pyx b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pyx new file mode 100644 index 0000000000..fd46da4095 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pyx @@ -0,0 +1,108 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t + +import os +import threading + +from cuda.core.experimental._device import Device +from cuda.core.experimental._program import Program, ProgramOptions + + +_tls = threading.local() + +# In multithreaded environment we share the compiled and loaded modules between threads. +# Each thread has its own cache mapping arch -> kernel_code_str -> Kernel ptr, +# on a cache miss, we first take a look into the shared cache guarded with _kernel_lock +# and eventually compile if needed. +_kernel_lock = threading.Lock() +_kernel_cache = {} # arch -> kernel_code_str -> Kernel + + +cpdef str get_strided_copy_include_dir(object logger): + """ + Finds and caches the absolute path for the strided copy includes. + """ + # TODO(ktokarski) Once Program API supports passing includes as strings and names, + # read all the headers once and cache them. + cdef str strided_copy_include_dir = getattr(_tls, "strided_copy_include_dir", None) + if strided_copy_include_dir is not None: + return strided_copy_include_dir + cdef str current_dir = os.path.dirname(os.path.abspath(__file__)) + cdef str copy_kernel_dir = os.path.normpath(os.path.join(current_dir, os.pardir, "include", "strided_copy")) + _tls.strided_copy_include_dir = copy_kernel_dir + if logger is not None: + logger.debug(f"Strided copy include dir: {copy_kernel_dir}") + return copy_kernel_dir + + +cdef inline str get_device_arch(int device_id): + # device_id -> arch + cdef dict device_ccs = getattr(_tls, "device_ccs", None) + if device_ccs is None: + device_ccs = {} + _tls.device_ccs = device_ccs + cdef str arch = device_ccs.get(device_id) + if arch is None: + arch = f"sm_{Device(device_id).arch}" + device_ccs[device_id] = arch + return arch + + +cdef compile_load_kernel(str kernel_code, str arch, object logger): + cdef str include_dir = get_strided_copy_include_dir(logger) + cdef options = ProgramOptions(arch=arch, include_path=include_dir) + cdef program = Program(kernel_code, code_type="c++", options=options) + cdef object_code = program.compile("cubin") + cdef kernel = object_code.get_kernel("execute") + return kernel + + +cdef inline intptr_t _get_or_compile_kernel(str kernel_code, str arch, object logger) except? 0: + cdef dict cc_cache = _kernel_cache.get(arch) + if cc_cache is None: + cc_cache = {} + _kernel_cache[arch] = cc_cache + + cdef kernel_obj = cc_cache.get(kernel_code) + if kernel_obj is None: + kernel_obj = compile_load_kernel(kernel_code, arch, logger) + cc_cache[kernel_code] = kernel_obj + if logger is not None: + logger.debug(f"Stored kernel ({kernel_obj}) (arch={arch}) in global cache.\n{kernel_code}") + elif logger is not None: + logger.debug(f"Loaded kernel ({kernel_obj}) (arch={arch}) from global cache.\n{kernel_code}") + return int(kernel_obj._handle) + + +cdef inline intptr_t get_or_compile_kernel(str kernel_code, str arch, object logger) except? 0: + with _kernel_lock: + return _get_or_compile_kernel(kernel_code, arch, logger) + + +cdef intptr_t get_kernel(str kernel_code, int device_id, object logger) except? 
0: + """ + Returns a pointer to the kernel function for a given kernel code and device id. + + In multithreaded environment, each thread has its own cache with pointers to the loaded + modules, if the cache is not populated, the shared cache guarded with _kernel_lock is used. + """ + cdef str arch = get_device_arch(device_id) + cdef dict local_kernel_cache = getattr(_tls, "local_kernel_cache", None) + if local_kernel_cache is None: + local_kernel_cache = {} + _tls.local_kernel_cache = local_kernel_cache + cdef dict local_cc_cache = local_kernel_cache.get(arch) + if local_cc_cache is None: + local_cc_cache = {} + local_kernel_cache[arch] = local_cc_cache + + cdef kernel_ptr = local_cc_cache.get(kernel_code) + if kernel_ptr is None: + kernel_ptr = get_or_compile_kernel(kernel_code, arch, logger) + local_cc_cache[kernel_code] = kernel_ptr + elif logger is not None: + logger.debug(f"Loaded kernel ({kernel_ptr}) for device {device_id=} ({arch=}) from thread local cache.\n{kernel_code}") + return kernel_ptr diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/args.h b/cuda_core/cuda/core/experimental/include/strided_copy/args.h new file mode 100644 index 0000000000..64605f106b --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/args.h @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_ARGS_H +#define CUDA_CORE_STRIDED_COPY_ARGS_H + +#include "impl/type_utils.h" + +#if defined(_MSC_VER) +// For Visual Studio, use __restrict +#define RESTRICT __restrict +#elif defined(__GNUC__) || defined(__clang__) +// For GCC and Clang, use __restrict__ +#define RESTRICT __restrict__ +#else +// Fallback for other compilers, or if restrict is not supported +#define RESTRICT +#endif + +namespace cuda_core +{ +template +struct KernelArgs +{ + void *RESTRICT dst_ptr; + const void *RESTRICT src_ptr; + int64_t dst_shape[N]; + int64_t src_shape[N]; + int64_t dst_strides[N]; + int64_t src_strides[N]; + int64_t grid_arg; +}; +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_ARGS_H diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/elementwise.h b/cuda_core/cuda/core/experimental/include/strided_copy/elementwise.h new file mode 100644 index 0000000000..0578934378 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/elementwise.h @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_ELEMENTWISE_H +#define CUDA_CORE_STRIDED_COPY_ELEMENTWISE_H + +#include "args.h" +#include "impl/array_view.h" +#include "impl/elementwise.h" +#include "impl/type_utils.h" +#include "impl/utils.h" +#include "impl/vec.h" + +#define ELEMENTWISE_KERNEL(stride_t, dst_ndim, src_ndim, itemsize, \ + needs_grid_stride_loop) \ + extern "C" \ + { \ + constexpr int max_ndim = dst_ndim > src_ndim ? 
dst_ndim : src_ndim; \ + void __global__ execute(const cuda_core::KernelArgs args) \ + { \ + cuda_core::elementwise_copy \ + kernel; \ + kernel(args); \ + } \ + } + +namespace cuda_core +{ + +template +struct elementwise_copy +{ + using dtype_t = opaque_t; + using dst_coords_t = vec; + using src_coords_t = vec; + using dst_array_view_t = array_view; + using src_array_view_t = array_view; + using grid_indexer_t = element_indexer; + constexpr static bool has_equal_shapes = dst_ndim == src_ndim; + constexpr static int ndim = dst_ndim > src_ndim ? dst_ndim : src_ndim; + + DEV void operator()(const KernelArgs args) const + { + dst_array_view_t dst_array_view{static_cast(args.dst_ptr), + dst_coords_t{args.dst_shape}, + dst_coords_t{args.dst_strides}}; + src_array_view_t src_array_view{static_cast(args.src_ptr), + src_coords_t{args.src_shape}, + src_coords_t{args.src_strides}}; + auto kernel = elementwise_copy_impl{}; + kernel(std::move(dst_array_view), std::move(src_array_view), + grid_indexer_t{static_cast(args.grid_arg)}); + } +}; + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_ELEMENTWISE_H diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/array_view.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/array_view.h new file mode 100644 index 0000000000..b0e2cef8d1 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/array_view.h @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_ARRAY_VIEW_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_ARRAY_VIEW_H_ + +#include "utils.h" +#include "vec.h" + +#if defined(_MSC_VER) +// For Visual Studio, use __restrict +#define RESTRICT __restrict +#elif defined(__GNUC__) || defined(__clang__) +// For GCC and Clang, use __restrict__ +#define RESTRICT __restrict__ +#else +// Fallback for other compilers, or if restrict is not supported +#define RESTRICT +#endif + +namespace cuda_core +{ + +template +struct array_view +{ + // While indices cannot be negative (only strides can), + // we're using the same 32- or 64-bit signed type to represent both + // indices and strides for simplicity. In the end we need to convert + // both to the same signed type when computing the offset. 
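+  // For example, with shape {2, 3} and strides {3, 1} (in elements), index
+  // {1, 2} maps to offset 1*3 + 2*1 = 5, i.e. dot(idx, strides()).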
+ using coords_t = _coords_t; + using stride_t = typename coords_t::type; + using dtype_t = T; + static constexpr int ndim = coords_t::ndim; + + HOST_DEV constexpr array_view(T *__restrict__ data, coords_t shape, + coords_t strides) + : shape_(shape), strides_(strides), data_(data) {} + + HOST_DEV T &operator[](const coords_t idx) const + { + return data_[offset(idx)]; + } + HOST_DEV T &operator[](const stride_t offset) const { return data_[offset]; } + HOST_DEV stride_t offset(const coords_t idx) const + { + return dot(idx, strides()); + } + HOST_DEV coords_t shape() const { return shape_; } + HOST_DEV coords_t strides() const { return strides_; } + HOST_DEV T *data() const { return data_; } + +protected: + coords_t shape_; + coords_t strides_; + T *RESTRICT data_; +}; + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_ARRAY_VIEW_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/elementwise.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/elementwise.h new file mode 100644 index 0000000000..a0f8d49c02 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/elementwise.h @@ -0,0 +1,128 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_ELEMENTWISE_H +#define CUDA_CORE_STRIDED_COPY_IMPL_ELEMENTWISE_H + +#include "array_view.h" +#include "type_utils.h" +#include "utils.h" +#include "vec.h" + +namespace cuda_core +{ + +namespace detail +{ + +template +DEV coords_t unravel_idx(const stride_t flat_idx, const coords_t shape) +{ + constexpr int ndim = coords_t::ndim; + if constexpr (ndim <= 0) + { + return {}; + } + else if constexpr (ndim == 1) + { + return {flat_idx}; + } + else if constexpr (ndim > 1) + { + + // the extents cannot be negative and the arithmetic on unsigned integer + // is noticeably faster + using u_stride_t = typename type_traits::unsign::type; + u_stride_t u_flat_idx = flat_idx; + coords_t unraveled_coords; +#pragma unroll + for (int i = ndim - 1; i >= 1; i--) + { + u_stride_t extent = shape[i]; + if (extent & (extent - 1)) + { + u_stride_t next_flat_idx = u_flat_idx / extent; + unraveled_coords[i] = u_flat_idx - next_flat_idx * extent; + u_flat_idx = next_flat_idx; + } + else + { + unraveled_coords[i] = u_flat_idx & (extent - 1); + u_flat_idx >>= ffs(extent) - 1; + } + } + unraveled_coords[0] = u_flat_idx; + return unraveled_coords; + } +} + +} // namespace detail + +template +struct element_indexer +{ + // stride_t can be 32-bit integer for tensor_volume and gridDim * blockDim up + // to INT_MAX, this way unsigned x < INT_MAX; x += INT_MAX cannot overflow + using ustride_t = typename type_traits::unsign::type; + static constexpr bool needs_grid_stride_loop = _needs_grid_stride_loop; + + constexpr HOST_DEV element_indexer(const stride_t tensor_volume) + : tensor_volume(tensor_volume) {} + + template + DEV void with_grid_stride_loop(Cb &&cb) const + { + // early cast the special indexing variables to the desired integer width + // type to avoid arithmetic on 32-bit integers when 64-bit stride_t is used + const ustride_t thread_idx = threadIdx.x; + const ustride_t block_idx = blockIdx.x; + const ustride_t block_dim = blockDim.x; + if constexpr (!needs_grid_stride_loop) + { + const ustride_t x = block_idx * block_dim + thread_idx; + if (x < tensor_volume) + { + cb(x); + } + } + else if constexpr (needs_grid_stride_loop) + { + const 
ustride_t grid_dim = gridDim.x; + const ustride_t grid_size = grid_dim * block_dim; + for (ustride_t x = block_idx * block_dim + thread_idx; x < tensor_volume; + x += grid_size) + { + cb(x); + } + } + } + + ustride_t tensor_volume; +}; + +template +struct elementwise_copy_impl +{ + using stride_t = typename dst_array_view_t::stride_t; + + DEV void operator()(const dst_array_view_t &&dst_view, + const src_array_view_t &&src_view, + const grid_indexer_t &&grid_helper) + { + grid_helper.with_grid_stride_loop([=](const stride_t flat_element_idx) + { + const auto dst_coords = + detail::unravel_idx(flat_element_idx, dst_view.shape()); + const auto src_coords = + cond_val(bconst(), dst_coords, + detail::unravel_idx(flat_element_idx, src_view.shape())); + dst_view[dst_coords] = src_view[src_coords]; }); + } +}; + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_ELEMENTWISE_H diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/type_utils.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/type_utils.h new file mode 100644 index 0000000000..72d0b58c1e --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/type_utils.h @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_TYPE_UTILS_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_TYPE_UTILS_H_ + +namespace cuda_core +{ +using int32_t = int; +using uint32_t = unsigned int; +using int64_t = long long int; +using uint64_t = unsigned long long int; +static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes"); +static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes"); +static_assert(sizeof(int64_t) == 8, "int64_t must be 8 bytes"); +static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes"); + +// Use a struct to represent type of element so that we don't rely +// on actual representation of the type, available arithmetic etc. +template +struct alignas(n_bytes) opaque_t +{ + char data[n_bytes]; +}; + +static_assert(sizeof(opaque_t<1>) == 1, "opaque_t<1> must be 1 byte"); +static_assert(sizeof(opaque_t<2>) == 2, "opaque_t<2> must be 2 bytes"); +static_assert(sizeof(opaque_t<4>) == 4, "opaque_t<4> must be 4 bytes"); +static_assert(sizeof(opaque_t<8>) == 8, "opaque_t<8> must be 8 bytes"); +static_assert(sizeof(opaque_t<16>) == 16, "opaque_t<16> must be 16 bytes"); + +static_assert(alignof(opaque_t<1>) == alignof(unsigned char), + "opaque_t<1> must be 1 byte"); +static_assert(alignof(opaque_t<2>) == alignof(unsigned short), + "opaque_t<2> must be 2 bytes"); +static_assert(alignof(opaque_t<4>) == alignof(unsigned int), + "opaque_t<4> must be 4 bytes"); +static_assert(alignof(opaque_t<8>) == alignof(unsigned long long int), + "opaque_t<8> must be 8 bytes"); +#ifdef __CUDA_ARCH__ +static_assert(alignof(opaque_t<16>) == 16, "opaque_t<16> must be 16 bytes"); +#endif +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_TYPE_UTILS_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/utils.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/utils.h new file mode 100644 index 0000000000..a43bc72f44 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/utils.h @@ -0,0 +1,180 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. 
+// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_UTILS_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_UTILS_H_ + +#include "type_utils.h" + +#if defined(__CUDACC__) +#define HOST_DEV __host__ __device__ __forceinline__ +#define DEV __device__ __forceinline__ +#else +#define HOST_DEV +#define DEV +#endif + +namespace cuda_core +{ + +// Some of stl type traits are not available with nvrtc +namespace type_traits +{ +template +struct conditional +{ +}; + +template +struct conditional +{ + using type = T; +}; + +template +struct conditional +{ + using type = F; +}; + +template +struct enable_if +{ +}; +template +struct enable_if +{ + typedef T type; +}; + +template +struct unsign +{ +}; + +template <> +struct unsign +{ + using type = uint64_t; +}; + +template <> +struct unsign +{ + using type = uint32_t; +}; + +template +struct is_32_or_64_int +{ + static constexpr bool value = false; +}; + +template <> +struct is_32_or_64_int +{ + static constexpr bool value = true; +}; + +template <> +struct is_32_or_64_int +{ + static constexpr bool value = true; +}; + +template +struct min_val +{ +}; + +template <> +struct min_val +{ + static constexpr int32_t value = -2147483648; +}; + +template <> +struct min_val +{ + static constexpr int64_t value = -9223372036854775808LL; +}; +} // namespace type_traits + +template +struct const_val +{ + using type = T; + static constexpr T value = val; +}; + +template +using iconst = const_val; + +template +using bconst = const_val; + +template +HOST_DEV auto constexpr cond_val(bconst, true_val_t &&true_val, + false_val_t &&false_val) +{ + return false_val; +} + +template +HOST_DEV auto constexpr cond_val(bconst, true_val_t &&true_val, + false_val_t &&false_val) +{ + return true_val; +} + +#if defined(__CUDACC__) + +DEV int ffs(uint32_t x) { return __ffs(x); } + +DEV int ffs(int32_t x) { return __ffs(x); } + +DEV int ffs(uint64_t x) { return __ffsll(x); } + +DEV int ffs(int64_t x) { return __ffsll(x); } + +#endif + +HOST_DEV constexpr int log2_floor(const int k) +{ + return k == 1 ? 0 : 1 + log2_floor(k >> 1); +} + +template +struct mod_div +{ + static_assert(k > 0, "k must be positive"); + static_assert((k & (k - 1)) == 0, "k must be a power of 2"); + static constexpr int value = k; + static constexpr int log2 = log2_floor(k); + static constexpr int mask = k - 1; + HOST_DEV constexpr int operator()() const { return k; } +}; + +template +HOST_DEV constexpr T operator/(const T a, const mod_div) +{ + return a >> mod_div::log2; +} + +template +HOST_DEV constexpr T operator%(const T a, const mod_div) +{ + return a & mod_div::mask; +} + +template +HOST_DEV constexpr T operator*(const T a, const mod_div) +{ + return a << mod_div::log2; +} + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_UTILS_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/vec.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/vec.h new file mode 100644 index 0000000000..1a75e42d2c --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/vec.h @@ -0,0 +1,212 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. 
+// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_VEC_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_VEC_H_ + +#include "utils.h" + +namespace cuda_core +{ + +template +struct vec_base +{ + T v[N]; + + template ::value && ...)>::type> + HOST_DEV constexpr vec_base(Components... components) : v{T(components)...} {} + + template ::value>::type> + HOST_DEV constexpr vec_base(const U *ptr) + { + for (int i = 0; i < N; i++) + { + v[i] = ptr[i]; + } + } + + HOST_DEV constexpr T &operator[](int i) { return v[i]; } + HOST_DEV constexpr const T &operator[](int i) const { return v[i]; } +}; + +template +struct vec_base +{ +}; + +template +struct vec : vec_base +{ + using base_t = vec_base; + using type = T; + constexpr static int ndim = N; + + constexpr vec() = default; + + template + HOST_DEV constexpr vec(Components... components) : base_t{components...} {} + + template + HOST_DEV constexpr vec(const U *ptr, int ndim) : base_t(ptr, ndim) {} + + HOST_DEV constexpr int size() const { return ndim; } + + template + HOST_DEV constexpr auto last(const iconst) const + { + static_assert(K <= ndim); + return slice(iconst(), iconst()); + } + + template + HOST_DEV constexpr auto first(const iconst) const + { + static_assert(K <= ndim); + return slice(iconst<0>(), iconst()); + } + + template + HOST_DEV constexpr vec slice(const iconst, + const iconst) const + { + static_assert(start >= 0 && end <= ndim); + constexpr int slice_ndim = end - start; + static_assert(slice_ndim >= 0); + if constexpr (slice_ndim != 0) + { + vec result; +#pragma unroll + for (int i = 0; i < slice_ndim; i++) + { + result[i] = this->operator[](start + i); + } + return result; + } + return {}; + } +}; + +template +HOST_DEV constexpr vec cat(const vec a, const vec b) +{ + constexpr int ndim = N + M; + if constexpr (ndim != 0) + { + vec result; + if constexpr (N > 0) + { +#pragma unroll + for (int i = 0; i < N; i++) + { + result[i] = a[i]; + } + } + if constexpr (M > 0) + { +#pragma unroll + for (int i = 0; i < M; i++) + { + result[N + i] = b[i]; + } + } + return result; + } + return {}; +} + +template +HOST_DEV constexpr auto vector_bin_op(const vec a, const vec b, + Op &&op) +{ + static_assert(N > 0, "N must be positive"); + using result_t = decltype(op(a[0], b[0])); + vec result; +#pragma unroll + for (int i = 0; i < N; i++) + { + result[i] = op(a[i], b[i]); + } + return result; +} + +template +HOST_DEV constexpr auto operator+(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a + b; }); +} + +template +HOST_DEV constexpr auto operator*(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a * b; }); +} + +template +HOST_DEV constexpr auto operator-(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a - b; }); +} + +template +HOST_DEV constexpr auto operator/(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a / b; }); +} + +template +HOST_DEV constexpr bool any(Pred &&pred, const vec a, + const vec... vs) +{ + for (int i = 0; i < N; i++) + { + if (pred(a[i], vs[i]...)) + return true; + } + return false; +} + +template +HOST_DEV constexpr bool all(Pred &&pred, const vec a, + const vec... 
vs) +{ + for (int i = 0; i < N; i++) + { + if (!pred(a[i], vs[i]...)) + return false; + } + return true; +} + +template +HOST_DEV constexpr T dot(const vec a, const vec b) +{ + if constexpr (N == 0) + { + return 0; + } + else if constexpr (N != 0) + { + T sum = a[0] * b[0]; +#pragma unroll + for (int i = 1; i < N; i++) + { + sum += a[i] * b[i]; + } + return sum; + } +} + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_VEC_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy_utils.hpp b/cuda_core/cuda/core/experimental/include/strided_copy_utils.hpp new file mode 100644 index 0000000000..758952cb82 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy_utils.hpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_UTILS_HPP +#define CUDA_CORE_STRIDED_COPY_UTILS_HPP + +#include +#include +#include +#include +#include + +#include "layout.hpp" +#include "strided_copy/args.h" + +template +void _get_strided_copy_args_ndim( + std::unique_ptr> &args, + void *dst_ptr, const void *src_ptr, + int dst_ndim, int src_ndim, + int64_t *dst_shape, int64_t *src_shape, + int64_t *dst_strides, int64_t *src_strides, + int64_t grid_arg) +{ + using uptr_t = std::unique_ptr, std::function>; + uptr_t ptr{new cuda_core::KernelArgs, [](void *p) + { delete (static_cast *>(p)); }}; + ptr->dst_ptr = dst_ptr; + ptr->src_ptr = src_ptr; + for (int i = 0; i < dst_ndim; i++) + { + ptr->dst_shape[i] = dst_shape[i]; + ptr->dst_strides[i] = dst_strides[i]; + } + for (int i = 0; i < src_ndim; i++) + { + ptr->src_shape[i] = src_shape[i]; + ptr->src_strides[i] = src_strides[i]; + } + ptr->grid_arg = grid_arg; + args = std::move(ptr); +} + +template +void _call_with_static_ndim(int ndim, Cb &&cb) +{ + if constexpr (i > max_ndim) + { + throw std::runtime_error("unsupported ndim"); + } + else if constexpr (i <= max_ndim) + { + if (ndim == i) + { + cb(std::integral_constant()); + } + else + { + _call_with_static_ndim(ndim, std::move(cb)); + } + } +} + +void inline _get_strided_copy_args( + std::unique_ptr> &args, + void *dst_ptr, const void *src_ptr, + int dst_ndim, int src_ndim, + int64_t *dst_shape, int64_t *src_shape, + int64_t *dst_strides, int64_t *src_strides, + int64_t grid_arg) +{ + int max_ndim = dst_ndim >= src_ndim ? dst_ndim : src_ndim; + _call_with_static_ndim(max_ndim, [&](auto static_ndim_holder) + { + constexpr int static_ndim = decltype(static_ndim_holder)::value; + _get_strided_copy_args_ndim( + args, dst_ptr, src_ptr, + dst_ndim, src_ndim, + dst_shape, src_shape, + dst_strides, src_strides, + grid_arg); }); +} + +#endif // CUDA_CORE_STRIDED_COPY_UTILS_HPP diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py index 3227f1eae1..daeecec849 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/experimental/utils.py @@ -1,9 +1,9 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 - from cuda.core.experimental._layout import StridedLayout # noqa: F401 from cuda.core.experimental._memoryview import ( StridedMemoryView, # noqa: F401 args_viewable_as_strided_memory, # noqa: F401 ) +from cuda.core.experimental._strided_copy._copy import CopyAllocatorOptions # noqa: F401 diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index af99ddd361..d08701397a 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -69,7 +69,7 @@ issues = "https://github.com/NVIDIA/cuda-python/issues/" include = ["cuda.core*"] [tool.setuptools.package-data] -"cuda.core.experimental.include" = ["*.h", "*.hpp", "*.cuh"] +"cuda.core.experimental.include" = ["**/*.h", "**/*.hpp", "**/*.cuh"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } diff --git a/cuda_core/tests/test_strided_memory_view_copy.py b/cuda_core/tests/test_strided_memory_view_copy.py new file mode 100644 index 0000000000..347d8f22a9 --- /dev/null +++ b/cuda_core/tests/test_strided_memory_view_copy.py @@ -0,0 +1,639 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +import ctypes +import logging +import random +from contextlib import contextmanager +from enum import Enum +from io import StringIO + +import cuda.core.experimental as ccx +import numpy as np +import pytest +from cuda.core.experimental import Buffer +from cuda.core.experimental import ( + system as ccx_system, +) +from cuda.core.experimental._strided_copy._copy import _with_logger +from cuda.core.experimental.utils import ( + CopyAllocatorOptions, + StridedLayout, + StridedMemoryView, +) + +try: + import cupy as cp +except ImportError: + cp = None + +from helpers.layout import ( + NamedParam, + dtype_from_itemsize, + inv_permutation, + permuted, + pretty_name, +) + + +class CopyDirection(Enum): + D2D = "d2d" + D2H = "d2h" + H2D = "h2d" + + +class SrcFormat(Enum): + C_CONTIGUOUS = "c_contiguous" # the src is contiguous in some order + F_CONTIGUOUS = "f_contiguous" # the src is contiguous in some order + SLICED = "sliced" # the src is sliced (has gaps) + + +class Transpose(Enum): + SAME_ORDER = False # the src and dst have the same stride order + INVERSE = "inverse" # the src and dst have the inverse stride order (e.g. 
F <-> C) + PERMUTATION = "permutation" # the src and dst have permuted stride orders + + +class DstFormat(Enum): + DENSE = "dense" # the dst is contigious in some order + SLICED = "sliced" # the dst is sliced (has gaps) + + +class Broadcast(Enum): + NO = False + RIGHT = "right" + LEFT = "left" + RIGHT_LEFT = "right_left" + + +class CustomAllocator(Enum): + HOST = "host" + DEVICE = "device" + BOTH = "host_device" + + +_ITEMSIZES = [1, 2, 4, 8, 16] + +py_rng = random.Random(43) +num_devices = ccx_system.num_devices + + +def get_ptr(array): + if isinstance(array, np.ndarray): + return array.ctypes.data + elif isinstance(array, cp.ndarray): + return array.data.ptr + else: + raise ValueError(f"Invalid array: {type(array)}") + + +@contextmanager +def with_ccx_device(device_id): + current_dev = ccx.Device() + current_dev_id = current_dev.device_id + dev = ccx.Device(device_id) + try: + dev.set_current() + yield dev + finally: + if current_dev_id != device_id: + current_dev.set_current() + + +_log_stream = StringIO() +_logger = logging.Logger("smv_copy_test", level=logging.DEBUG) +_logger.addHandler(logging.StreamHandler(_log_stream)) +_logger.setLevel(logging.DEBUG) + + +@contextmanager +def with_logger(): + try: + with _with_logger(_logger): + yield _log_stream + finally: + _log_stream.truncate(0) + + +def get_src_order(src_format): + assert isinstance(src_format, SrcFormat) + if src_format == SrcFormat.F_CONTIGUOUS: + return "F" + return "C" + + +def get_dst_order(rng, src_format, transpose, shape): + assert isinstance(transpose, Transpose) + src_order = get_src_order(src_format) + if transpose == Transpose.SAME_ORDER: + return src_order + elif transpose == Transpose.INVERSE: + return "F" if src_order == "C" else "C" + else: + assert transpose == Transpose.PERMUTATION + perm = list(range(len(shape))) + rng.shuffle(perm) + return tuple(perm) + + +def get_src_shape(base_src_shape, broadcast): + assert isinstance(broadcast, Broadcast) + match broadcast: + case Broadcast.NO | Broadcast.LEFT: + return base_src_shape + case Broadcast.RIGHT | Broadcast.RIGHT_LEFT: + return base_src_shape + (1,) + raise ValueError(f"Invalid broadcast: {broadcast}") + + +def get_dst_shape(rng, base_src_shape, broadcast): + assert isinstance(broadcast, Broadcast) + match broadcast: + case Broadcast.NO: + return base_src_shape + case Broadcast.RIGHT: + return base_src_shape + (rng.randint(2, 5),) + case Broadcast.RIGHT_LEFT: + return (rng.randint(2, 5),) + base_src_shape + (rng.randint(2, 5),) + case Broadcast.LEFT: + return (rng.randint(2, 5),) + base_src_shape + raise ValueError(f"Invalid broadcast: {broadcast}") + + +def is_h2h_needed(direction, src_shape, src_format, dst_shape, dst_format, broadcast): + if direction == CopyDirection.D2D: + return False + elif direction == CopyDirection.H2D: + return len(src_shape) > 1 and (src_format == SrcFormat.SLICED) or broadcast != Broadcast.NO + else: + assert direction == CopyDirection.D2H + return len(dst_shape) > 1 and (dst_format == DstFormat.SLICED) + + +def get_src(is_host, src_format, src_order, implicit_c, shape, itemsize): + assert isinstance(src_format, SrcFormat) + assert isinstance(implicit_c, bool) + mod = np if is_host else cp + dtype = dtype_from_itemsize(itemsize) + slices = None + if src_format == SrcFormat.SLICED: + shape = tuple(e + 2 for e in shape) + slices = tuple(slice(1, -1) for _ in shape) + array = mod.arange(np.prod(shape), dtype=dtype).reshape(shape, order=src_order) + smv = StridedMemoryView.from_dlpack(array, -1) + # enforce implicit C strides + if 
implicit_c and smv.layout.is_contiguous_c: + layout = StridedLayout(smv.shape, None, itemsize) + smv = smv.view(layout=layout) + if slices is not None: + base_ptr = smv.ptr + smv = smv.view(layout=smv.layout[slices]) + assert smv.layout.slice_offset != 0 + assert smv.ptr == base_ptr + smv.layout.slice_offset_in_bytes + assert smv.shape != shape + assert smv.layout.volume > 0 + return smv + + +def get_dst( + is_host, + src_order, + transpose, + dst_order, + implicit_c, + dst_format, + dst_shape, + itemsize, +): + assert isinstance(dst_format, DstFormat) + assert isinstance(implicit_c, bool) + mod = np if is_host else cp + dtype = dtype_from_itemsize(itemsize) + slices = None + if dst_format == DstFormat.SLICED: + dst_shape = tuple(e + 2 for e in dst_shape) + slices = tuple(slice(1, -1) for _ in dst_shape) + if transpose == Transpose.SAME_ORDER: + assert src_order in "CF" + assert dst_order == src_order + elif transpose == Transpose.INVERSE: + assert dst_order in "CF" + assert dst_order != src_order + else: + assert transpose == Transpose.PERMUTATION + assert isinstance(dst_order, tuple) + if transpose == Transpose.PERMUTATION: + array = mod.arange(np.prod(dst_shape), dtype=dtype) + array = array.reshape(permuted(dst_shape, dst_order)).transpose(inv_permutation(dst_order)) + else: + array = mod.arange(np.prod(dst_shape), dtype=dtype).reshape(dst_shape, order=dst_order) + smv = StridedMemoryView.from_dlpack(array, -1) + # enforce implicit C strides + if implicit_c and smv.layout.is_contiguous_c: + layout = StridedLayout(smv.shape, None, itemsize) + smv = smv.view(layout=layout) + if slices is not None: + base_ptr = smv.ptr + smv = smv.view(layout=smv.layout[slices]) + assert smv.layout.slice_offset != 0 + assert smv.ptr == base_ptr + smv.layout.slice_offset_in_bytes + assert smv.shape != dst_shape + assert smv.layout.volume > 0 + return smv + + +def as_array(device_id, smv): + min_offset, max_offset = smv.layout.offset_bounds + size = (max_offset - min_offset + 1) * smv.layout.itemsize + dtype = smv.dtype + if dtype is None: + dtype = dtype_from_itemsize(smv.layout.itemsize) + if device_id is None: + c_mem = memoryview((ctypes.c_char * size).from_address(smv.ptr)) + np_array = np.frombuffer(c_mem, dtype=dtype) + if smv.layout.strides_in_bytes is None: + return np_array.reshape(smv.shape, order="C") + else: + return np.lib.stride_tricks.as_strided( + np_array, + shape=smv.shape, + strides=smv.layout.strides_in_bytes, + ) + else: + assert smv.is_device_accessible + um = cp.cuda.UnownedMemory(smv.ptr, size, smv, device_id) + mem = cp.cuda.MemoryPointer(um, 0) + cp_array = cp.ndarray( + shape=smv.shape, + strides=smv.layout.strides_in_bytes, + dtype=dtype, + memptr=mem, + ) + return cp_array + + +@pytest.mark.parametrize( + ( + "src_shape", + "dst_shape", + "direction", + "src_format", + "transpose", + "dst_format", + "implicit_c", + "broadcast", + "itemsize", + "device_id", + "default_stream", + "src_order", + "dst_order", + "copy_from", + "blocking", + ), + [ + ( + NamedParam("src_shape", src_shape), + NamedParam("dst_shape", dst_shape), + direction, + src_format, + transpose, + dst_format, + NamedParam("implicit_c", implicit_c), + broadcast, + NamedParam("itemsize", py_rng.choice(_ITEMSIZES)), + NamedParam( + "device_id", + py_rng.randint(0, num_devices - 1) if num_devices >= 0 else None, + ), + NamedParam("default_stream", py_rng.choice([True, False])), + NamedParam("src_order", get_src_order(src_format)), + NamedParam("dst_order", get_dst_order(py_rng, src_format, transpose, dst_shape)), + 
NamedParam("copy_from", py_rng.choice([True, False])), + NamedParam("blocking", py_rng.choice([True, False])), + ) + for base_src_shape in [ + tuple(), + (11,), + (16,), + (16, 12), + (1, 16, 1, 12, 1), + (13, 12, 8), + (4, 13, 15), + ] + for direction in list(CopyDirection) + for src_format in list(SrcFormat) + for transpose in list(Transpose) + for dst_format in list(DstFormat) + for implicit_c in [True, False] + if not implicit_c or src_format == SrcFormat.C_CONTIGUOUS + for broadcast in [py_rng.choice([Broadcast.NO, py_rng.choice(list(Broadcast)[1:])])] + for src_shape in [get_src_shape(base_src_shape, broadcast)] + for dst_shape in [get_dst_shape(py_rng, base_src_shape, broadcast)] + if src_format != SrcFormat.SLICED or len(src_shape) > 0 + if transpose == Transpose.SAME_ORDER or len(dst_shape) > 1 + if dst_format != DstFormat.SLICED or len(dst_shape) > 0 + ], + ids=pretty_name, +) +def test_strided_memory_view_copy( + src_shape, + dst_shape, + direction, + src_format, + transpose, + dst_format, + implicit_c, + broadcast, + itemsize, + device_id, + default_stream, + src_order, + dst_order, + copy_from, + blocking, +): + device_id = device_id.value + src_shape = src_shape.value + dst_shape = dst_shape.value + implicit_c = implicit_c.value + itemsize = itemsize.value + default_stream = default_stream.value + src_order = src_order.value + dst_order = dst_order.value + copy_from = copy_from.value + blocking = blocking.value + + if device_id is None: + pytest.skip("No devices available") + if cp is None: + pytest.skip("cupy is not installed") + + assert isinstance(direction, CopyDirection) + is_src_host = direction == CopyDirection.H2D + is_dst_host = direction == CopyDirection.D2H + + cp_stream = None + stream = None + try: + with cp.cuda.Device(device_id): + if default_stream: + stream = ccx.Device(device_id).default_stream + cp_stream = cp.cuda.ExternalStream(int(stream.handle), device_id) + else: + cp_stream = cp.cuda.Stream(non_blocking=True) + stream = ccx.Stream.from_handle(cp_stream.ptr) + + with cp_stream: + src = get_src(is_src_host, src_format, src_order, implicit_c, src_shape, itemsize) + dst = get_dst( + is_dst_host, + src_order, + transpose, + dst_order, + implicit_c, + dst_format, + dst_shape, + itemsize, + ) + + if not is_src_host: + assert src.device_id == device_id + assert src.is_device_accessible + if not is_dst_host: + assert dst.is_device_accessible + assert dst.device_id == device_id + + if broadcast != Broadcast.NO: + assert src.shape != dst.shape + else: + assert src.shape == dst.shape + + with with_ccx_device(device_id), with_logger() as log_stream: + if copy_from: + dst.copy_from(src, stream, blocking=blocking) + else: + src.copy_to(dst, stream, blocking=blocking) + debug_log = log_stream.getvalue() + + if blocking or is_h2h_needed(direction, src_shape, src_format, dst_shape, dst_format, broadcast): + assert f"Syncing stream {int(stream.handle)}" in debug_log + else: + # if no extra H2H is needed we should respect non-blocking flag + assert "Syncing stream" not in debug_log + + src_array = as_array(None if is_src_host else device_id, src) + dst_array = as_array(None if is_dst_host else device_id, dst) + assert src_array.shape == src.shape == src_shape + assert dst_array.shape == dst.shape == dst_shape + if src.layout.strides_in_bytes is None: + dense_strides = StridedLayout.dense(src.shape, src.layout.itemsize).strides_in_bytes + assert src_array.strides == dense_strides + else: + assert src_array.strides == src.layout.strides_in_bytes + if 
dst.layout.strides_in_bytes is None: + dense_strides = StridedLayout.dense(dst.shape, dst.layout.itemsize).strides_in_bytes + assert dst_array.strides == dense_strides + else: + assert dst_array.strides == dst.layout.strides_in_bytes + + with cp.cuda.Device(device_id): + if not blocking: + stream.sync() + if not is_src_host: + src_array = cp.asnumpy(src_array) + if not is_dst_host: + dst_array = cp.asnumpy(dst_array) + + if broadcast != Broadcast.NO: + src_array = np.broadcast_to(src_array, dst_array.shape) + np.testing.assert_equal(src_array, dst_array) + + finally: + if not default_stream and stream is not None: + stream.close() + + +class CustomDeviceAllocator(ccx.MemoryResource): + def __init__(self, device: ccx.Device): + self.device = device + self._mr = device.memory_resource + self._recorded = [] + + def allocate(self, size, stream=None): + self._recorded.append((size, stream)) + return self._mr.allocate(size, stream) + + def deallocate(self, ptr, size, stream=None): + self._mr.deallocate(ptr, size, stream) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return self.device.device_id + + +class CustomHostAllocator(ccx.LegacyPinnedMemoryResource): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._recorded = [] + + def allocate(self, size, stream=None): + self._recorded.append((size, stream)) + return super().allocate(size, stream) + + +@pytest.mark.parametrize( + ("direction", "custom_allocator", "use_dataclass", "dtype", "device_id"), + [ + ( + direction, + custom_allocator, + use_dataclass, + py_rng.choice([np.float16, np.float32, np.float64, np.complex64]), + NamedParam( + "device_id", + py_rng.randint(0, num_devices - 1) if num_devices >= 0 else None, + ), + ) + for direction in [CopyDirection.H2D, CopyDirection.D2H] + for custom_allocator in [ + CustomAllocator.HOST, + CustomAllocator.DEVICE, + CustomAllocator.BOTH, + ] + for use_dataclass in [True, False] + ], + ids=pretty_name, +) +def test_custom_allocator(direction, custom_allocator, use_dataclass, dtype, device_id): + if cp is None: + pytest.skip("cupy is not installed") + + device_id = device_id.value + + assert isinstance(direction, CopyDirection) + is_src_host = direction == CopyDirection.H2D + is_dst_host = direction == CopyDirection.D2H + src_mod = np if is_src_host else cp + dst_mod = np if is_dst_host else cp + + shape = (111, 122) + with cp.cuda.Device(device_id): + a = src_mod.arange(np.prod(shape), dtype=dtype).reshape(shape, order="C") + a = a[:, ::-2] + b = dst_mod.arange(np.prod(shape), dtype=dtype).reshape(shape, order="F") + b = b[::-1, : shape[1] // 2] + + nbytes = a.nbytes + src_buf = Buffer.from_handle(get_ptr(a), nbytes, owner=a) + dst_buf = Buffer.from_handle(get_ptr(b), nbytes, owner=b) + src_layout = StridedLayout(a.shape, a.strides, a.itemsize, divide_strides=True) + dst_layout = StridedLayout(b.shape, b.strides, b.itemsize, divide_strides=True) + src = StridedMemoryView.from_buffer(src_buf, src_layout) + dst = StridedMemoryView.from_buffer(dst_buf, dst_layout) + + host_allocator = None + device_allocator = None + if custom_allocator in [CustomAllocator.HOST, CustomAllocator.BOTH]: + host_allocator = CustomHostAllocator() + if custom_allocator in [CustomAllocator.DEVICE, CustomAllocator.BOTH]: + device_allocator = CustomDeviceAllocator(ccx.Device(device_id)) + + if use_dataclass: + allocator = CopyAllocatorOptions(host=host_allocator, 
device=device_allocator) + else: + allocator = {k: v for k, v in [("host", host_allocator), ("device", device_allocator)] if v is not None} + + with with_ccx_device(device_id) as dev: + stream = dev.default_stream + dst.copy_from(src, stream, blocking=True, allocator=allocator) + + if host_allocator is not None: + assert host_allocator._recorded == [(nbytes, stream)] + if device_allocator is not None: + assert device_allocator._recorded == [(nbytes, stream)] + + src_array = as_array(None if is_src_host else device_id, src) + dst_array = as_array(None if is_dst_host else device_id, dst) + with cp.cuda.Device(device_id): + if not is_src_host: + src_array = cp.asnumpy(src_array) + if not is_dst_host: + dst_array = cp.asnumpy(dst_array) + np.testing.assert_equal(src_array, dst_array) + + +def test_wrong_shape(): + a = np.arange(10).reshape((2, 5)) + d = ccx.Device(0) + d.set_current() + layout = StridedLayout.dense((2, 6), a.itemsize) + buf = d.memory_resource.allocate(layout.required_size_in_bytes()) + a_view = StridedMemoryView.from_dlpack(a, -1) + b_view = StridedMemoryView.from_buffer(buf, layout) + with pytest.raises(ValueError, match="cannot be broadcast together"): + b_view.copy_from(a_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="cannot be broadcast together"): + a_view.copy_from(b_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="cannot be broadcast together"): + a_view.copy_to(b_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="cannot be broadcast together"): + b_view.copy_to(a_view, d.default_stream, blocking=True) + + +def test_wrong_dtype(): + a = np.arange(10, dtype=np.int32).reshape((2, 5)) + d = ccx.Device(0) + d.set_current() + layout = StridedLayout.dense((2, 5), a.itemsize) + buf = d.memory_resource.allocate(layout.required_size_in_bytes()) + a_view = StridedMemoryView.from_dlpack(a, -1) + b_view = StridedMemoryView.from_buffer(buf, layout, dtype=np.float32) + with pytest.raises(ValueError, match="destination and source dtypes"): + b_view.copy_from(a_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="destination and source dtypes"): + a_view.copy_to(b_view, d.default_stream, blocking=True) + + +def test_wrong_itemsize(): + a = np.arange(10, dtype=np.int32).reshape((2, 5)) + d = ccx.Device(0) + d.set_current() + layout = StridedLayout.dense((2, 5), 8) + buf = d.memory_resource.allocate(layout.required_size_in_bytes()) + a_view = StridedMemoryView.from_dlpack(a, -1) + b_view = StridedMemoryView.from_buffer(buf, layout) + with pytest.raises(ValueError, match="itemsize"): + b_view.copy_from(a_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="itemsize"): + a_view.copy_to(b_view, d.default_stream, blocking=True) + + +def test_overlapping_dst(): + a = np.arange(10) + a = np.lib.stride_tricks.sliding_window_view(a, 3, -1) + # do this manually, as through dlpack numpy marks the view readonly + buf = Buffer.from_handle(a.ctypes.data, a.nbytes, owner=a) + layout = StridedLayout(a.shape, a.strides, a.itemsize, divide_strides=True) + host_overlapping_view = StridedMemoryView.from_buffer(buf, layout) + layout = host_overlapping_view.layout + d = ccx.Device(0) + d.set_current() + buf = d.memory_resource.allocate(10 * a.itemsize) + dev_overlapping_view = StridedMemoryView.from_buffer(buf, host_overlapping_view.layout) + dense_layout = layout.to_dense() + dense_buf = d.memory_resource.allocate(dense_layout.required_size_in_bytes()) + dense_view = 
StridedMemoryView.from_buffer(dense_buf, dense_layout) + with pytest.raises(ValueError, match="destination layout is non-unique"): + dense_view.copy_to(host_overlapping_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="destination layout is non-unique"): + dense_view.copy_to(dev_overlapping_view, d.default_stream, blocking=True)
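
A minimal usage sketch of the copy path exercised above, not part of the patch itself: it assumes a CUDA-capable device, NumPy, and the experimental API exactly as used by the tests in this diff (the no-argument `LegacyPinnedMemoryResource()` construction and the `-1` stream argument to `from_dlpack` are taken from the test code; the array and variable names are illustrative).

import numpy as np

import cuda.core.experimental as ccx
from cuda.core.experimental.utils import (
    CopyAllocatorOptions,
    StridedLayout,
    StridedMemoryView,
)

dev = ccx.Device(0)
dev.set_current()
stream = dev.default_stream

# Host source: a non-contiguous (sliced) NumPy array, viewed without copying.
a = np.arange(100, dtype=np.float32).reshape(10, 10)[:, ::2]
src = StridedMemoryView.from_dlpack(a, -1)

# Device destination: a dense layout with the source's shape and itemsize.
layout = StridedLayout.dense(src.shape, src.layout.itemsize)
buf = dev.memory_resource.allocate(layout.required_size_in_bytes())
dst = StridedMemoryView.from_buffer(buf, layout, dtype=a.dtype)

# If the copy needs temporary staging buffers (it does here, since the host
# source is strided), they are allocated from these resources; both fields
# are optional and default resources are used when omitted.
allocator = CopyAllocatorOptions(
    host=ccx.LegacyPinnedMemoryResource(),
    device=dev.memory_resource,
)
dst.copy_from(src, stream, allocator=allocator, blocking=True)

As in the tests, the caller is responsible for setting the proper current device before invoking `copy_from`/`copy_to`, and a dict such as `{"host": ..., "device": ...}` may be passed in place of `CopyAllocatorOptions`.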