diff --git a/cuda_core/cuda/core/experimental/_layout.pxd b/cuda_core/cuda/core/experimental/_layout.pxd index 301cdaaa65..1596a96af3 100644 --- a/cuda_core/cuda/core/experimental/_layout.pxd +++ b/cuda_core/cuda/core/experimental/_layout.pxd @@ -170,6 +170,12 @@ cdef class StridedLayout: _init_base_layout_from_tuple(base, shape, None) return self._init_dense(base, itemsize, order_flag, &stride_order_vec) + cdef inline StridedLayout copy(StridedLayout self): + cdef StridedLayout new_layout = StridedLayout.__new__(StridedLayout) + new_layout.init_from_ptr(self.base.ndim, self.base.shape, self.base.strides, self.itemsize) + new_layout.slice_offset = self.slice_offset + return new_layout + # ============================== # Properties # ============================== @@ -318,6 +324,18 @@ cdef class StridedLayout: # Layout manipulation # ============================== + cdef inline int make_dense(StridedLayout self, OrderFlag order_flag, axis_vec_t *stride_order_vec) except -1 nogil: + if order_flag == ORDER_C: + _dense_strides_c(self.base) + elif order_flag == ORDER_F: + _dense_strides_f(self.base) + elif order_flag == ORDER_PERM: + _dense_strides_in_order(self.base, deref(stride_order_vec)) + else: + raise ValueError("The stride_order must be 'C', 'F', or a permutation.") + self.slice_offset = 0 + self._prop_mask = 0 + return 0 cdef int reshape_into(StridedLayout self, StridedLayout out_layout, BaseLayout& new_shape) except -1 nogil cdef int permute_into(StridedLayout self, StridedLayout out_layout, axis_vec_t& axis_order) except -1 nogil @@ -375,21 +393,32 @@ cdef inline stride_t *get_strides_ptr(BaseLayout& base) except? NULL nogil: return tmp_strides -cdef inline bint _base_layout_equal(BaseLayout& a, BaseLayout& b) noexcept nogil: +cdef inline bint base_equal_shapes(BaseLayout& a, BaseLayout& b) noexcept nogil: if a.ndim != b.ndim: return False for i in range(a.ndim): if a.shape[i] != b.shape[i]: return False - if a.strides != NULL or b.strides != NULL: - if a.strides == NULL or b.strides == NULL: + return True + + +cdef inline bint _base_equal_strides(BaseLayout& a, BaseLayout& b) noexcept nogil: + if a.strides == NULL or b.strides == NULL: + return a.strides == b.strides + for i in range(a.ndim): + if a.strides[i] != b.strides[i]: return False - for i in range(a.ndim): - if a.strides[i] != b.strides[i]: - return False return True +cdef inline bint base_equal_strides(BaseLayout& a, BaseLayout& b) noexcept nogil: + return a.ndim == b.ndim and _base_equal_strides(a, b) + + +cdef inline bint base_layout_equal(BaseLayout& a, BaseLayout& b) noexcept nogil: + return base_equal_shapes(a, b) and _base_equal_strides(a, b) + + @cython.overflowcheck(True) cdef inline int64_t _volume(BaseLayout& base) except? 
-1 nogil: cdef int64_t vol = 1 diff --git a/cuda_core/cuda/core/experimental/_layout.pyx b/cuda_core/cuda/core/experimental/_layout.pyx index 65efd4025d..adfc1662a9 100644 --- a/cuda_core/cuda/core/experimental/_layout.pyx +++ b/cuda_core/cuda/core/experimental/_layout.pyx @@ -177,7 +177,7 @@ cdef class StridedLayout: ) def __eq__(self : StridedLayout, other : StridedLayout) -> bool: - return self.itemsize == other.itemsize and self.slice_offset == other.slice_offset and _base_layout_equal(self.base, other.base) + return self.itemsize == other.itemsize and self.slice_offset == other.slice_offset and base_layout_equal(self.base, other.base) @property def ndim(self : StridedLayout): diff --git a/cuda_core/cuda/core/experimental/_memoryview.pyx b/cuda_core/cuda/core/experimental/_memoryview.pyx index dc972d912a..b8118a45d4 100644 --- a/cuda_core/cuda/core/experimental/_memoryview.pyx +++ b/cuda_core/cuda/core/experimental/_memoryview.pyx @@ -5,7 +5,11 @@ from ._dlpack cimport * from libc.stdint cimport intptr_t from cuda.core.experimental._layout cimport StridedLayout -from cuda.core.experimental._stream import Stream +from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._strided_copy._copy cimport copy_into_d2d, copy_into_d2h, copy_into_h2d +from cuda.core.experimental._strided_copy._copy_utils cimport get_data_ptr +from cuda.core.experimental._stream cimport Stream +from cuda.core.experimental._strided_copy._copy import CopyAllocatorOptions import functools import warnings @@ -15,9 +19,6 @@ import numpy from cuda.core.experimental._utils.cuda_utils import handle_return, driver - -from cuda.core.experimental._memory import Buffer - # TODO(leofang): support NumPy structured dtypes @@ -160,7 +161,7 @@ cdef class StridedMemoryView: @classmethod def from_buffer( - cls, buffer : Buffer, layout : StridedLayout, + cls, Buffer buffer, StridedLayout layout, dtype : numpy.dtype | None = None, is_readonly : bool = False ) -> StridedMemoryView: @@ -218,7 +219,7 @@ cdef class StridedMemoryView: dlm_tensor.deleter(dlm_tensor) def view( - self, layout : StridedLayout | None = None, dtype : numpy.dtype | None = None + self, StridedLayout layout = None, object dtype = None ) -> StridedMemoryView: """ Creates a new view with adjusted layout and dtype. @@ -236,7 +237,7 @@ cdef class StridedMemoryView: def copy_from( self, other : StridedMemoryView, stream : Stream, - allocator = None, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None = None, blocking : bool | None = None, ): """ @@ -253,16 +254,19 @@ cdef class StridedMemoryView: if :attr:`dtype` is not specified). * The destination's layout must be unique (see :meth:`StridedLayout.is_unique`). + It is the user's responsibility to ensure the proper current device is set + when calling this method. + Parameters ---------- other : StridedMemoryView The view to copy data from. stream : Stream | None, optional The stream to schedule the copy on. - allocator : MemoryResource | None, optional - If temporary buffers are needed, the specifed memory resources - will be used to allocate the memory. If not specified, default - resources will be used. + allocator : :obj:`~utils.CopyAllocatorOptions` | dict[str, :obj:`~_memory.MemoryResource`] | None, optional + If temporary buffers are needed, the specified ``allocator.host`` and + ``allocator.device`` memory resources will be used to allocate the memory + on the host and device, respectively.
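+            For illustration (the resource objects here are placeholders),
+            both of the following forms are accepted::
+
+                CopyAllocatorOptions(host=pinned_host_mr, device=device_mr)
+                {"host": pinned_host_mr, "device": device_mr}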
blocking : bool | None, optional Whether the call should block until the copy is complete. * ``True``: the ``stream`` is synchronized with the host at the end of the call, @@ -274,11 +278,11 @@ cdef class StridedMemoryView: * for device-to-device, it defaults to ``False`` (non-blocking), * for host-to-device or device-to-host, it defaults to ``True`` (blocking). """ - raise NotImplementedError("Sorry, not supported: copy_from") + copy_into(self, other, stream, allocator, blocking) def copy_to( - self, other : StridedMemoryView, stream : Stream | None = None, - allocator = None, + self, other : StridedMemoryView, Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None = None, blocking : bool | None = None, ): """ @@ -286,7 +290,7 @@ cdef class StridedMemoryView: For details, see :meth:`copy_from`. """ - raise NotImplementedError("Sorry, not supported: copy_to") + copy_into(other, self, stream, allocator, blocking) @property def layout(self) -> StridedLayout: @@ -342,7 +346,7 @@ cdef class StridedMemoryView: raise ValueError("Cannot infer layout from the exporting object") return self._layout - cdef inline object get_buffer(self): + cdef inline Buffer get_buffer(self): """ Returns Buffer instance with the underlying data. If the SMV was created from a Buffer, it will return the same Buffer instance. @@ -661,13 +665,9 @@ cdef StridedLayout layout_from_cai(object metadata): return layout -cdef inline intptr_t get_data_ptr(object buffer, StridedLayout layout) except? 0: - return (int(buffer.handle)) + layout.get_slice_offset_in_bytes() - - cdef inline int view_buffer_strided( StridedMemoryView view, - object buffer, + Buffer buffer, StridedLayout layout, object dtype, bint is_readonly, @@ -706,3 +706,54 @@ cdef inline int view_buffer_strided( view._layout = layout view._dtype = dtype return 0 + + +cdef int copy_into( + StridedMemoryView dst, + StridedMemoryView src, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None = None, + blocking : bool | None = None, +) except -1: + cdef object dst_dtype = dst.get_dtype() + cdef object src_dtype = src.get_dtype() + if dst_dtype is not None and src_dtype is not None and dst_dtype != src_dtype: + raise ValueError( + f"The destination and source dtypes must be the same, " + f"got {dst_dtype} and {src_dtype}." + ) + if dst.readonly: + raise ValueError("The destination view is readonly.") + cdef bint is_src_device_accessible = src.is_device_accessible + cdef bint is_dst_device_accessible = dst.is_device_accessible + cdef bint is_blocking + cdef int device_id + cdef Buffer dst_buffer = dst.get_buffer() + cdef Buffer src_buffer = src.get_buffer() + cdef StridedLayout dst_layout = dst.get_layout() + cdef StridedLayout src_layout = src.get_layout() + if is_src_device_accessible and is_dst_device_accessible: + device_id = dst.device_id + if src.device_id != device_id: + raise ValueError( + f"The destination and source views must be on the " + f"same device, got {device_id} and {src.device_id}." 
+ ) + is_blocking = blocking if blocking is not None else False + copy_into_d2d(dst_buffer, dst_layout, src_buffer, src_layout, device_id, stream, is_blocking) + return 0 + elif is_src_device_accessible: + device_id = src.device_id + is_blocking = blocking if blocking is not None else True + copy_into_d2h(dst_buffer, dst_layout, src_buffer, src_layout, device_id, stream, allocator, is_blocking) + return 0 + elif is_dst_device_accessible: + device_id = dst.device_id + is_blocking = blocking if blocking is not None else True + copy_into_h2d(dst_buffer, dst_layout, src_buffer, src_layout, device_id, stream, allocator, is_blocking) + return 0 + else: + raise ValueError( + "The host-to-host copy is not supported, " + "at least one of the views must be device-accessible." + ) diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_copy.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pxd new file mode 100644 index 0000000000..b2e945e4df --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pxd @@ -0,0 +1,44 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from cuda.core.experimental._stream cimport Stream +from cuda.core.experimental._layout cimport StridedLayout +from cuda.core.experimental._memory._buffer cimport Buffer + +from cuda.core.experimental._memory import MemoryResource + + +cdef int copy_into_d2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, +) except -1 + + +cdef int copy_into_d2h( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1 + + +cdef int copy_into_h2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_copy.pyx b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pyx new file mode 100644 index 0000000000..736f89a715 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_copy.pyx @@ -0,0 +1,528 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +cimport cython + +from libc.stdint cimport intptr_t, int64_t + +from cuda.core.experimental._layout cimport axis_vec_t, ORDER_C +from cuda.core.experimental._strided_copy._copy_utils cimport ( + logging_axis_order, logging_memcopy, logging_py_objs, memcpy_async, + volume_in_bytes, maybe_sync, get_data_ptr, + flatten_together, maybe_broadcast_src, + check_itemsize, _view_as_numpy, _view_as_strided, _np_dtype +) +from cuda.core.experimental._strided_copy._cuda_kernel cimport cuda_kernel_copy + +import contextlib +import threading +from dataclasses import dataclass +from cuda.core.experimental._device import Device +from cuda.core.experimental._memory import MemoryResource, DeviceMemoryResource +import numpy as _numpy + + +_thread_local = threading.local() +_current_logger = None + + +@contextlib.contextmanager +def _with_logger(logger): + # Utility meant for debugging and testing purposes. 
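+    # A minimal usage sketch (test/debug helper; the logger name and the views
+    # below are placeholders):
+    #
+    #     import logging
+    #     with _with_logger(logging.getLogger("cuda.core.strided_copy")):
+    #         dst_view.copy_from(src_view, stream=stream)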
+ global _current_logger + _current_logger = logger + yield + _current_logger = None + + +@dataclass +class CopyAllocatorOptions: + host : MemoryResource | None = None + device : DeviceMemoryResource | None = None + + +cdef inline allocator_options(allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None): + if allocator is None or type(allocator) is CopyAllocatorOptions: + return allocator + return CopyAllocatorOptions(**allocator) + + +cdef inline object _numpy_empty( + Buffer host_alloc, + StridedLayout layout, + object logger, +): + """ + The layout must be contiguous in some order (layout.get_is_contiguous_any() is True) + If host_alloc is not None, returns a numpy array being a view on the host_alloc with + shape, itemsize and strides as in the layout. + Otherwise, returns a new numpy array with shape, itemsize and strides as in the layout. + """ + if host_alloc is not None: + return _view_as_numpy(int(host_alloc.handle), volume_in_bytes(layout), layout) + cdef object a = _numpy.empty(layout.get_volume(), dtype=_np_dtype(layout.itemsize)) + return _view_as_strided(a, layout) + + +cdef inline object _numpy_ascontiguousarray( + Buffer host_alloc, + intptr_t data_ptr, + int64_t size, + StridedLayout layout, + object logger, +): + """ + Returns a numpy array with the same shape and itemsize as the layout, + but C-contiguous strides. The data_ptr must be a valid host pointer to + a tensor described with the layout. + The layout is modified in place so that it is C-contiguous and dense. + If host_alloc is provided, copies the data there and returns a view on it. + Otherwise, returns a new numpy array. + """ + cdef object a = _view_as_numpy(data_ptr, size, layout) + if logger is not None: + logger.debug( + f"({layout}) is not contiguous, coalescing H2H copy is needed." + ) + layout.make_dense(ORDER_C, NULL) + if host_alloc is None: + return _numpy.ascontiguousarray(a) + cdef object b = _numpy_empty(host_alloc, layout, logger) + _numpy.copyto(b, a) + return b + + +cdef inline get_device_default_mr(int device_id): + cdef dict device_default_mrs = getattr(_thread_local, "device_default_mrs", None) + if device_default_mrs is None: + device_default_mrs = {} + _thread_local.device_default_mrs = device_default_mrs + cdef mr = device_default_mrs.get(device_id) + if mr is not None: + return mr + + # We're accessing the device's default mr for the first time + # We need to make sure the device's context has ever been set + cdef object current_dev = Device() + cdef int current_dev_id = current_dev.device_id + cdef object dev = Device(device_id) + try: + dev.set_current() + mr = dev.memory_resource + device_default_mrs[device_id] = mr + return mr + finally: + if current_dev_id != device_id: + current_dev.set_current() + + +cdef inline Buffer _device_allocate( + device_allocator : DeviceMemoryResource | None, + int64_t size, + int device_id, + Stream stream, +): + if device_allocator is None: + device_allocator = get_device_default_mr(device_id) + return device_allocator.allocate(size, stream) + + +cdef inline int _copy_into_d2d( + intptr_t dst_ptr, + StridedLayout dst_layout, + intptr_t src_ptr, + StridedLayout src_layout, + int device_id, + intptr_t stream_ptr, + bint blocking, + object logger, +) except -1 nogil: + # Note: this function assumes that layouts were squeezed + # already and can be modified in place (i.e. they are not + # referenced elsewhere, e.g. by StridedMemoryView). 
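+    #
+    # In outline (matching the steps below): permute both layouts to the
+    # destination's stride order, flatten axes that are mergeable in both,
+    # then either issue a single cuMemcpyAsync (when both layouts end up
+    # C-contiguous) or fall back to the JIT-compiled strided-copy kernel.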
+ + cdef int64_t size = volume_in_bytes(dst_layout) + if size == 0: + return 0 + + # Normalize the layouts: + # 1. permute the layouts so that the dst layouts order is C-like. + # 2. flatten extents that are mergable in both layouts + cdef axis_vec_t axis_order + dst_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + flatten_together(dst_layout, src_layout) + if logger is not None: + logging_axis_order("The dst_order is {fst}", logger, axis_order) + logging_py_objs("Normalized layouts: dst {fst}, src {snd}", logger, dst_layout, src_layout) + + # Can we just memcpy? + # We permuted the extents to C-order, so if layouts are dense, they are C-contiguous. + # The precondition is that the shapes are equal, thus the c-contiguity implies equal strides. + if dst_layout.get_is_contiguous_c() and src_layout.get_is_contiguous_c(): + if logger is not None: + logging_memcopy( + "The layouts are contiguous and have the same stride order, we can memcopy and return.\n{msg}", + logger, "D2D", + dst_ptr, dst_layout, + src_ptr, src_layout, + blocking, size, stream_ptr + ) + memcpy_async(dst_ptr, src_ptr, size, stream_ptr) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + if not dst_layout.get_is_unique(): + raise ValueError( + f"The destination layout is non-unique, i.e. some elements " + f"may overlap in memory: {dst_layout}" + ) + cuda_kernel_copy(dst_ptr, dst_layout, src_ptr, src_layout, device_id, stream_ptr, logger) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + +cdef inline int _copy_into_h2d( + intptr_t dst_data_ptr, + StridedLayout dst_layout, + intptr_t src_data_ptr, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, + host_allocator : MemoryResource | None, + device_allocator : DeviceMemoryResource | None, + object logger, +) except -1: + """ + Copies data from host to device, rearranging the data if needed. + In a simplest case of contiguous layouts with matching stride orders, + only H2D memcpy is needed. + Otherwise, up to 2 extra copies (H2H and D2D) may be needed + for 3 different reasons: + * if src is non-contiguous we need a H2H 'coalescing' copy + * if dst is non-contiguous we need a D2D 'scattering' copy + * if the dst and src's stride orders differ, we need + a either H2H or D2D copy to transpose the data. + Moving data around in the device memory should be faster, + so we prefer D2D for transpose-copy, unless coalescing + H2H is needed anyway while D2D for scattering can be avoided. + """ + + # Note, this function assumes that layouts were squeezed + # already and can be modified in place (i.e. they are not + # referenced elsewhere, e.g. by StridedMemoryView). + + cdef intptr_t stream_ptr = stream._handle + cdef int64_t size = volume_in_bytes(dst_layout) + if size == 0: + return 0 + + # Permute the layouts so that src has C-like strides + # (increasing from the right to the left) + cdef axis_vec_t axis_order + src_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + + # First, make sure the src is contiguous, so that we can memcpy from it. 
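+    # For example (an illustrative case, not the only one): a host source that
+    # is a strided slice of a larger array, e.g. every other column, is not
+    # contiguous in any axis order, so it is packed into a C-contiguous host
+    # staging array here before the single H2D memcpy below.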
+ cdef Buffer host_alloc = None + if host_allocator is not None: + host_alloc = host_allocator.allocate(size, stream) + # A numpy array (either view on host_alloc or a new regular numpy array) + cdef object src_tmp = None + if not src_layout.get_is_contiguous_c(): + if dst_layout.get_is_contiguous_any(): + # We cannot avoid H2H copy, but we can avoid D2D copy + # if we also make sure that the src and dst stride orders match. + # We permute layouts to dst order, so that dst is C-contig + # and the src will be made C-contig by the H2H copy. + dst_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + # the host allocation may not be stream-aware (in particular, the default numpy is not!) + # so we need to block at least until the H2D is complete to avoid deallocating too early. + blocking = True + src_tmp = _numpy_ascontiguousarray(host_alloc, src_data_ptr, size, src_layout, logger) + src_data_ptr = src_tmp.ctypes.data + + if dst_layout.get_is_contiguous_c(): + # We made sure that src layout is C-contig too, we can just memcopy. + if logger is not None: + logging_memcopy( + "The layouts are contiguous and have the same stride order, we can memcopy and return.\n{msg}", + logger, "H2D", + dst_data_ptr, dst_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dst_data_ptr, src_data_ptr, size, stream_ptr) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + # Otherwise, either dst is not contiguous or src has a different stride order than dst. + # In either case, src is contiguous in some order, so we can just memcopy + # it to a temporary buffer and then perform a D2D transpose/scatter copy to dst. + cdef Buffer dev_tmp = _device_allocate(device_allocator, size, device_id, stream) + cdef intptr_t dev_tmp_data_ptr = int(dev_tmp.handle) + if logger is not None: + logging_memcopy( + f"First, memcpy into a temporary device buffer:\n{{msg}}\n" + f"Followed by scatter/transpose D2D copy:\n" + f"({dst_data_ptr}, {dst_layout} <- {dev_tmp_data_ptr}, {src_layout})", + logger, "H2D", + dev_tmp_data_ptr, src_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dev_tmp_data_ptr, src_data_ptr, size, stream_ptr) + _copy_into_d2d( + dst_data_ptr, dst_layout, + dev_tmp_data_ptr, src_layout, + device_id, stream_ptr, + blocking, logger + ) + return 0 + + +cdef inline int _copy_into_d2h( + intptr_t dst_data_ptr, + StridedLayout dst_layout, + intptr_t src_data_ptr, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, + host_allocator : MemoryResource | None, + device_allocator : DeviceMemoryResource | None, + object logger, +) except -1: + """ + Copies data from device to host, rearranging the data if needed. + In a simplest case of contiguous layouts with matching stride orders, + only D2H memcpy is needed. + Otherwise, up to 2 extra copies (D2D and H2H) may be needed: + * if src is non-contiguous we need a D2D 'coalescing' copy + * if dst is non-contiguous we need a H2H 'scattering' copy + * if the dst and src's stride orders differ, we need + a either D2D or H2H copy to transpose the data. + Moving data around in the device memory should be faster, + so we prefer D2D for transpose-copy, unless H2H is needed anyway + for scattering while D2D can be avoided. + """ + + # Note, this function assumes that layouts were squeezed + # already and can be modified in place (i.e. 
they are not + # referenced elsewhere, e.g. by StridedMemoryView). + + cdef intptr_t stream_ptr = stream._handle + cdef int64_t size = volume_in_bytes(dst_layout) + if size == 0: + return 0 + + if not dst_layout.get_is_unique(): + raise ValueError( + f"The destination layout is non-unique, i.e. some elements " + f"may overlap in memory: {dst_layout}" + ) + + # Permute the layouts so that dst has C-like strides + # (increasing from the right to the left) + cdef axis_vec_t axis_order + dst_layout.get_stride_order(axis_order) + dst_layout.permute_into(dst_layout, axis_order) + src_layout.permute_into(src_layout, axis_order) + + # If, after the permutation to C-like order, dst is still not C-contig, + # we need to H2H scatter the data. + cdef bint is_dst_contig = dst_layout.get_is_contiguous_c() + + cdef Buffer src_tmp = None + cdef intptr_t src_tmp_data_ptr + cdef StridedLayout src_tmp_layout + if ( + # if dst does not require scattering H2H copy, + # run D2D whenever src needs transposing or coalescing + (is_dst_contig and not src_layout.get_is_contiguous_c()) + # otherwise, as H2H is needed anyway, + # run D2D only if src requires coalescing + or not src_layout.get_is_contiguous_any() + ): + # After the copy, the src will be coalesced and + # transposed to dst order. + src_tmp_layout = src_layout.to_dense("C") + src_tmp = _device_allocate(device_allocator, size, device_id, stream) + src_tmp_data_ptr = int(src_tmp.handle) + if logger is not None: + logger.debug( + f"We need to coalesce or transpose the data, " + f"running D2D copy into a temporary device buffer:\n" + f"({src_tmp_data_ptr}, {src_tmp_layout} <- {src_data_ptr}, {src_layout})" + ) + _copy_into_d2d( + src_tmp_data_ptr, + # pass a copy of the dense layout, we'll + # need it later and d2d can modify it in place + src_tmp_layout.copy(), + src_data_ptr, src_layout, + device_id, stream_ptr, + False, logger + ) + src_data_ptr = src_tmp_data_ptr + src_layout = src_tmp_layout + + if is_dst_contig: + # The dst is c-contig. If we run a D2D copy, it made src C-contig too. + # If we didn't run a D2D copy, src must have been C-contig already. + if logger is not None: + logging_memcopy( + "The layouts are contiguous and have the same stride order, we can memcopy and return.\n{msg}", + logger, "D2H", + dst_data_ptr, dst_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dst_data_ptr, src_data_ptr, size, stream_ptr) + maybe_sync(stream_ptr, blocking, logger) + return 0 + + # Otherwise, we need to D2H copy into a temp host buffer and run + # a H2H copy to scatter or transpose the data. 
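+    # For example (illustration only): when the host destination is itself a
+    # strided slice and so not contiguous in any order, the device data is
+    # first copied into a compact host staging buffer laid out like the
+    # (contiguous) source, and numpy.copyto below scatters it into the
+    # destination's strides.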
+ cdef Buffer host_alloc = None + if host_allocator is not None: + host_alloc = host_allocator.allocate(size, stream) + # A numpy array (either view on host_alloc or a new regular numpy array) + cdef object dst_tmp = _numpy_empty(host_alloc, src_layout, logger) + cdef intptr_t dst_tmp_data_ptr = dst_tmp.ctypes.data + if logger is not None: + logging_memcopy( + f"First memcpy into a temporary host buffer:\n{{msg}}\n" + f"Followed by scatter/transpose H2H copy:\n" + f"({dst_data_ptr}, {dst_layout} <- {dst_tmp_data_ptr}, {src_layout})", + logger, "D2H", + dst_data_ptr, src_layout, + src_data_ptr, src_layout, + blocking, size, stream_ptr + ) + with cython.nogil: + memcpy_async(dst_tmp_data_ptr, src_data_ptr, size, stream_ptr) + maybe_sync(stream_ptr, True, logger) + _numpy.copyto(_view_as_numpy(dst_data_ptr, size, dst_layout), dst_tmp) + return 0 + + +cdef int copy_into_d2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + bint blocking, +) except -1: + check_itemsize(dst_layout, src_layout) + src_layout = maybe_broadcast_src(dst_layout, src_layout) + + cdef intptr_t stream_ptr = stream._handle + cdef intptr_t dst_data_ptr = get_data_ptr(dst_buffer, dst_layout) + cdef intptr_t src_data_ptr = get_data_ptr(src_buffer, src_layout) + + # Get rid of all 1-extents, as their strides are irrelevant. + # Make sure the copy of the layouts is passed, as the _copy_into_d2d + # may modify those + cdef StridedLayout squeezed_dst = StridedLayout.__new__(StridedLayout) + cdef StridedLayout squeezed_src = StridedLayout.__new__(StridedLayout) + dst_layout.squeeze_into(squeezed_dst) + src_layout.squeeze_into(squeezed_src) + + return _copy_into_d2d( + dst_data_ptr, squeezed_dst, + src_data_ptr, squeezed_src, + device_id, stream_ptr, + blocking, _current_logger + ) + + +cdef int copy_into_h2d( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1: + check_itemsize(dst_layout, src_layout) + src_layout = maybe_broadcast_src(dst_layout, src_layout) + + cdef intptr_t dst_data_ptr = get_data_ptr(dst_buffer, dst_layout) + cdef intptr_t src_data_ptr = get_data_ptr(src_buffer, src_layout) + cdef host_allocator = None + cdef device_allocator = None + allocator = allocator_options(allocator) + if allocator is not None: + host_allocator = allocator.host + device_allocator = allocator.device + + # Get rid of all 1-extents, as their strides are irrelevant. 
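+    # (e.g. a (3, 1, 4) layout squeezes to (3, 4); the stride of the dropped
+    # axis is irrelevant to the copy)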
+ # Make sure the copy of the layouts is passed, as the _copy_into_h2d + # may modify those + cdef StridedLayout squeezed_dst = StridedLayout.__new__(StridedLayout) + cdef StridedLayout squeezed_src = StridedLayout.__new__(StridedLayout) + dst_layout.squeeze_into(squeezed_dst) + src_layout.squeeze_into(squeezed_src) + + return _copy_into_h2d( + dst_data_ptr, squeezed_dst, + src_data_ptr, squeezed_src, + device_id, stream, + blocking, host_allocator, device_allocator, + _current_logger, + ) + + +cdef int copy_into_d2h( + Buffer dst_buffer, + StridedLayout dst_layout, + Buffer src_buffer, + StridedLayout src_layout, + int device_id, + Stream stream, + allocator : CopyAllocatorOptions | dict[str, MemoryResource] | None, + bint blocking, +) except -1: + check_itemsize(dst_layout, src_layout) + src_layout = maybe_broadcast_src(dst_layout, src_layout) + + cdef intptr_t dst_data_ptr = get_data_ptr(dst_buffer, dst_layout) + cdef intptr_t src_data_ptr = get_data_ptr(src_buffer, src_layout) + cdef host_allocator = None + cdef device_allocator = None + allocator = allocator_options(allocator) + if allocator is not None: + host_allocator = allocator.host + device_allocator = allocator.device + + # Get rid of all 1-extents, as their strides are irrelevant. + # Make sure the copy of the layouts is passed, as the _copy_into_d2h + # may modify those + cdef StridedLayout squeezed_dst = StridedLayout.__new__(StridedLayout) + cdef StridedLayout squeezed_src = StridedLayout.__new__(StridedLayout) + dst_layout.squeeze_into(squeezed_dst) + src_layout.squeeze_into(squeezed_src) + + return _copy_into_d2h( + dst_data_ptr, squeezed_dst, + src_data_ptr, squeezed_src, + device_id, stream, + blocking, + host_allocator, device_allocator, + _current_logger + ) diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_copy_utils.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_copy_utils.pxd new file mode 100644 index 0000000000..102642bf08 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_copy_utils.pxd @@ -0,0 +1,167 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython + +cimport cpython +from cpython.memoryview cimport PyMemoryView_FromMemory + +from libc.stdint cimport intptr_t, int64_t + +from cuda.core.experimental._memory._buffer cimport Buffer +from cuda.core.experimental._layout cimport ( + StridedLayout, BaseLayout, axis_vec_t, axes_mask_t, + init_base_layout, _overflow_checked_mul, base_equal_shapes +) +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN + +from cuda.bindings cimport cydriver +from cuda.bindings.cydriver cimport CUstream, CUdeviceptr + +import numpy as _numpy + + +cdef inline int64_t div_ceil(int64_t a, int64_t b) except? -1 nogil: + return (a + b - 1) // b + + +cdef inline intptr_t get_data_ptr(Buffer buffer, StridedLayout layout) except? 0: + return (int(buffer.handle)) + layout.get_slice_offset_in_bytes() + + +cdef inline object _np_dtype(int itemsize): + if itemsize == 1: + return _numpy.uint8 + elif itemsize == 2: + return _numpy.uint16 + elif itemsize == 4: + return _numpy.uint32 + elif itemsize == 8: + return _numpy.uint64 + elif itemsize == 16: + return _numpy.complex128 + else: + raise ValueError(f"Unsupported itemsize: {itemsize}") + + +cdef inline _view_as_strided(object array, StridedLayout layout): + """ + Array must be a 1d numpy array with dtype.itemsize == layout.itemsize. 
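+    For example, a C-ordered (2, 3) layout with itemsize 4 is viewed with
+    shape (2, 3) and byte strides (12, 4).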
+ """ + cdef tuple strides = layout.get_strides_in_bytes_tuple() + if strides is None: + return array.reshape(layout.get_shape_tuple(), order='C') + else: + return _numpy.lib.stride_tricks.as_strided( + array, + shape=layout.get_shape_tuple(), + strides=strides + ) + + +cdef inline object _view_as_numpy(intptr_t data_ptr, int64_t size, StridedLayout layout): + """ + Note the returned array is non-owning, it's caller responsibility to keep buffer alive. + """ + cdef object buf = PyMemoryView_FromMemory(data_ptr, size, cpython.PyBUF_WRITE) + cdef object array = _numpy.frombuffer(buf, dtype=_np_dtype(layout.itemsize)) + return _view_as_strided(array, layout) + + +cdef inline int flatten_together(StridedLayout a, StridedLayout b) except -1 nogil: + cdef axes_mask_t axis_mask = a.get_flattened_axis_mask() & b.get_flattened_axis_mask() + a.flatten_into(a, axis_mask) + b.flatten_into(b, axis_mask) + return 0 + + +cdef inline bint vectorize_together(intptr_t dst_ptr, StridedLayout dst, intptr_t src_ptr, StridedLayout src) except -1 nogil: + cdef int max_itemsize = 8 + cdef int itemsize = dst.itemsize + if itemsize >= max_itemsize: + return False + cdef int new_itemsize = dst.get_max_compatible_itemsize(max_itemsize, dst_ptr) + if itemsize >= new_itemsize: + return False + new_itemsize = src.get_max_compatible_itemsize(new_itemsize, src_ptr) + if itemsize >= new_itemsize: + return False + dst.pack_into(dst, new_itemsize, 0, keep_dim=False) + src.pack_into(src, new_itemsize, 0, keep_dim=False) + return True + + +cdef inline int64_t volume_in_bytes(StridedLayout layout) except? -1 nogil: + return _overflow_checked_mul(layout.get_volume(), layout.itemsize) + + +cdef inline int check_itemsize(StridedLayout dst, StridedLayout src) except -1 nogil: + if dst.itemsize != src.itemsize: + raise ValueError( + f"The itemsize of the destination and source layouts must match. " + f"Got dst itemsize:{dst.itemsize} and src itemsize:{src.itemsize}" + ) + return 0 + + +cdef inline StridedLayout maybe_broadcast_src(StridedLayout dst, StridedLayout src): + if base_equal_shapes(dst.base, src.base): + return src + # If the shapes differ, try broadcasting the source layout to the destination layout. 
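+    # For example, a source of shape (1, 4) broadcasts against a destination
+    # of shape (3, 4), with the source repeated along its size-1 axis.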
+ cdef StridedLayout new_src = StridedLayout.__new__(StridedLayout) + cdef BaseLayout new_src_base + cdef int dst_ndim = dst.base.ndim + init_base_layout(new_src_base, dst_ndim) + for i in range(dst_ndim): + new_src_base.shape[i] = dst.base.shape[i] + src.broadcast_into(new_src, new_src_base) + return new_src + + +cdef inline int logging_axis_order(str msg, logger, axis_vec_t& fst) except -1 nogil: + with cython.gil: + logger.debug(msg.format(fst=fst)) + return 0 + + +cdef inline int logging_memcopy( + str msg, object logger, str kind, + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + bint blocking, int64_t size, intptr_t stream_ptr +) except -1 nogil: + with cython.gil: + logger.debug(msg.format(msg=( + f"Launching {kind} {'blocking' if blocking else 'non-blocking'} memcpy of " + f"{size} bytes on stream {stream_ptr}.\n" + f"Dst: {dst_ptr}, {dst} <- src: {src_ptr}, {src}" + ))) + return 0 + + +cdef inline int logging_py_objs(str msg, logger, fst=None, snd=None, third=None) except -1 nogil: + with cython.gil: + logger.debug(msg.format(fst=fst, snd=snd, third=third)) + return 0 + + +cdef inline int memcpy_async(intptr_t dst_ptr, intptr_t src_ptr, size_t size, intptr_t stream_ptr) except -1 nogil: + HANDLE_RETURN( + cydriver.cuMemcpyAsync( + dst_ptr, + src_ptr, + size, + stream_ptr + ) + ) + return 0 + + +cdef inline int maybe_sync(intptr_t stream_ptr, bint blocking, object logger) except -1 nogil: + if blocking: + if logger is not None: + with cython.gil: + logger.debug(f"Syncing stream {stream_ptr}.") + HANDLE_RETURN(cydriver.cuStreamSynchronize(stream_ptr)) + return 0 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pxd new file mode 100644 index 0000000000..a033007b75 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pxd @@ -0,0 +1,20 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython + +from libc.stdint cimport intptr_t + +from cuda.core.experimental._layout cimport StridedLayout + + +cdef int cuda_kernel_copy( + intptr_t dst_ptr, + StridedLayout dst, + intptr_t src_ptr, + StridedLayout src, + int device_id, + intptr_t stream_ptr, + object logger, +) except -1 nogil diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pyx b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pyx new file mode 100644 index 0000000000..93307c0985 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_cuda_kernel.pyx @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# +# SPDX-License-Identifier: Apache-2.0 + +cimport cython + +from libc.stdint cimport intptr_t, int64_t +from libcpp.memory cimport unique_ptr +from libcpp.functional cimport function + +from cuda.bindings cimport cydriver +from cuda.core.experimental._utils.cuda_utils cimport HANDLE_RETURN +from cuda.core.experimental._layout cimport StridedLayout, axes_mask_t, flatten_all_axes_mask, get_strides_ptr +from cuda.core.experimental._strided_copy._jit cimport get_kernel +from cuda.core.experimental._strided_copy._copy_utils cimport logging_py_objs, div_ceil, vectorize_together + +ctypedef unique_ptr[void, function[void(void*)]] opaque_args_t + + +cdef extern from "limits.h": + cdef int INT_MAX + cdef int INT_MIN + + +cdef extern from "include/strided_copy_utils.hpp": + void _get_strided_copy_args( + opaque_args_t& args, + void *dst_ptr, const void *src_ptr, + int dst_ndim, int src_ndim, + int64_t* dst_shape, int64_t* src_shape, + int64_t* dst_strides, int64_t* src_strides, + int64_t grid_arg) except + nogil + + +cdef inline int get_kernel_args( + opaque_args_t& args, + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + int64_t grid_arg +) except-1 nogil: + _get_strided_copy_args( + args, + dst_ptr, src_ptr, + dst.base.ndim, src.base.ndim, + dst.base.shape, src.base.shape, + get_strides_ptr(dst.base), get_strides_ptr(src.base), + grid_arg + ) + return 0 + + +cdef inline bint needs_wide_strides(int64_t grid_volume, StridedLayout dst, StridedLayout src) except?-1 nogil: + # grid_volume, i.e the block_size * num_blocks + if grid_volume > INT_MAX: + return True + cdef int64_t dst_min_offset = 0 + cdef int64_t dst_max_offset = 0 + cdef int64_t src_min_offset = 0 + cdef int64_t src_max_offset = 0 + dst.get_offset_bounds(dst_min_offset, dst_max_offset) + src.get_offset_bounds(src_min_offset, src_max_offset) + cdef int64_t min_offset = min(dst_min_offset, src_min_offset) + cdef int64_t max_offset = max(dst_max_offset, src_max_offset) + # forbid INT_MIN too so that: + # 1. abs() is safe + # 2. the INT_MIN can be used as special-value/sentinel in the kernel + return min_offset <= INT_MIN or max_offset > INT_MAX + + +cdef str emit_elementwise_kernel_code(StridedLayout dst, StridedLayout src, bint has_wide_strides, bint has_grid_stride_loop): + cdef str stride_t_str = "int64_t" if has_wide_strides else "int32_t" + cdef str has_grid_stride_loop_str = "true" if has_grid_stride_loop else "false" + kernel_code = f""" + #include "elementwise.h" + ELEMENTWISE_KERNEL({stride_t_str}, {dst.base.ndim}, {src.base.ndim}, {dst.itemsize}, {has_grid_stride_loop_str}) + """ + return kernel_code + + +cdef inline intptr_t get_elementwise_copy_kernel( + StridedLayout dst, StridedLayout src, + bint has_wide_strides, bint has_grid_stride_loop, + int device_id, object logger +) except? 0: + cdef str kernel_code = emit_elementwise_kernel_code(dst, src, has_wide_strides, has_grid_stride_loop) + cdef intptr_t kernel_ptr = get_kernel(kernel_code, device_id, logger) + return kernel_ptr + + +cdef inline int adjust_layouts_for_elementwise_copy(StridedLayout dst, StridedLayout src, object logger) except -1 nogil: + # We want the layouts to keep the same shapes, so that, in cuda kernel, + # we have to unravel flat element index only once. + # The exception is if one of the layouts is flattened to 1D, + # as those don't require unraveling. 
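+    # For example, when the destination is fully C-contiguous all of its axes
+    # can be merged into a single extent; the kernel then indexes it with the
+    # flat element index directly and only the source coordinates are
+    # unraveled.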
+ cdef int ndim = dst.base.ndim + if ndim == 1: + return 0 + cdef axes_mask_t all_extents = flatten_all_axes_mask(ndim) + cdef axes_mask_t dst_mask = dst.get_flattened_axis_mask() + cdef axes_mask_t src_mask = src.get_flattened_axis_mask() + if dst_mask == all_extents or src_mask == all_extents: + if dst_mask == all_extents: + dst.flatten_into(dst, dst_mask) + if src_mask == all_extents: + src.flatten_into(src, src_mask) + if logger is not None: + logging_py_objs("At least one of the layouts is flattened to 1D: dst {fst}, src {snd}", logger, dst, src) + return 0 + + +cdef int launch_elementwise_copy( + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + int block_size, int device_id, + intptr_t stream_ptr, object logger +) except -1 nogil: + cdef int64_t volume = dst.get_volume() + cdef int64_t num_logical_blocks = div_ceil(volume, block_size) + cdef int64_t cuda_num_blocks = min(num_logical_blocks, INT_MAX) + cdef bint has_grid_stride_loop = cuda_num_blocks != num_logical_blocks + cdef bint has_wide_strides = needs_wide_strides(num_logical_blocks * block_size, dst, src) + cdef opaque_args_t args + get_kernel_args(args, dst_ptr, dst, src_ptr, src, volume) + cdef void* args_ptr = args.get() + cdef intptr_t kernel_fn_ptr + with cython.gil: + kernel_fn_ptr = get_elementwise_copy_kernel(dst, src, has_wide_strides, has_grid_stride_loop, device_id, logger) + if logger is not None: + logger.debug( + f"Launching elementwise copy kernel {kernel_fn_ptr} " + f"with grid.x={cuda_num_blocks}, block.x={block_size}." + ) + HANDLE_RETURN(cydriver.cuLaunchKernel( + kernel_fn_ptr, + cuda_num_blocks, 1, 1, + block_size, 1, 1, + 0, # shared_mem_size + stream_ptr, + &args_ptr, + NULL + )) + return 0 + + +cdef inline int elementwise_copy( + intptr_t dst_ptr, StridedLayout dst, + intptr_t src_ptr, StridedLayout src, + int device_id, intptr_t stream_ptr, object logger +) except -1 nogil: + cdef int block_size = 128 + adjust_layouts_for_elementwise_copy(dst, src, logger) + launch_elementwise_copy(dst_ptr, dst, src_ptr, src, block_size, device_id, stream_ptr, logger) + return 0 + + +cdef int cuda_kernel_copy( + intptr_t dst_ptr, + StridedLayout dst, + intptr_t src_ptr, + StridedLayout src, + int device_id, + intptr_t stream_ptr, + object logger, +) except -1 nogil: + # the dst and layouts must be already validated and normalized, i.e.: + # * the shapes must be equal + # * the dst stride order must be C-like + # * implicit C-strides are not allowed (i.e. the strides must not be NULL) + # * the volume should be >= 2 + # * there should not be any extents equal to 1 + # * the layouts should be flattened together + if vectorize_together(dst_ptr, dst, src_ptr, src) and logger is not None: + logging_py_objs("Vectorized the layouts: dst {fst}, src {snd}", logger, dst, src) + elementwise_copy(dst_ptr, dst, src_ptr, src, device_id, stream_ptr, logger) + return 0 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_jit.pxd b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pxd new file mode 100644 index 0000000000..e8940d8dec --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pxd @@ -0,0 +1,7 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t + +cdef intptr_t get_kernel(str kernel_code, int device_id, object logger) except? 
0 diff --git a/cuda_core/cuda/core/experimental/_strided_copy/_jit.pyx b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pyx new file mode 100644 index 0000000000..fd46da4095 --- /dev/null +++ b/cuda_core/cuda/core/experimental/_strided_copy/_jit.pyx @@ -0,0 +1,108 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + +from libc.stdint cimport intptr_t + +import os +import threading + +from cuda.core.experimental._device import Device +from cuda.core.experimental._program import Program, ProgramOptions + + +_tls = threading.local() + +# In multithreaded environment we share the compiled and loaded modules between threads. +# Each thread has its own cache mapping arch -> kernel_code_str -> Kernel ptr, +# on a cache miss, we first take a look into the shared cache guarded with _kernel_lock +# and eventually compile if needed. +_kernel_lock = threading.Lock() +_kernel_cache = {} # arch -> kernel_code_str -> Kernel + + +cpdef str get_strided_copy_include_dir(object logger): + """ + Finds and caches the absolute path for the strided copy includes. + """ + # TODO(ktokarski) Once Program API supports passing includes as strings and names, + # read all the headers once and cache them. + cdef str strided_copy_include_dir = getattr(_tls, "strided_copy_include_dir", None) + if strided_copy_include_dir is not None: + return strided_copy_include_dir + cdef str current_dir = os.path.dirname(os.path.abspath(__file__)) + cdef str copy_kernel_dir = os.path.normpath(os.path.join(current_dir, os.pardir, "include", "strided_copy")) + _tls.strided_copy_include_dir = copy_kernel_dir + if logger is not None: + logger.debug(f"Strided copy include dir: {copy_kernel_dir}") + return copy_kernel_dir + + +cdef inline str get_device_arch(int device_id): + # device_id -> arch + cdef dict device_ccs = getattr(_tls, "device_ccs", None) + if device_ccs is None: + device_ccs = {} + _tls.device_ccs = device_ccs + cdef str arch = device_ccs.get(device_id) + if arch is None: + arch = f"sm_{Device(device_id).arch}" + device_ccs[device_id] = arch + return arch + + +cdef compile_load_kernel(str kernel_code, str arch, object logger): + cdef str include_dir = get_strided_copy_include_dir(logger) + cdef options = ProgramOptions(arch=arch, include_path=include_dir) + cdef program = Program(kernel_code, code_type="c++", options=options) + cdef object_code = program.compile("cubin") + cdef kernel = object_code.get_kernel("execute") + return kernel + + +cdef inline intptr_t _get_or_compile_kernel(str kernel_code, str arch, object logger) except? 0: + cdef dict cc_cache = _kernel_cache.get(arch) + if cc_cache is None: + cc_cache = {} + _kernel_cache[arch] = cc_cache + + cdef kernel_obj = cc_cache.get(kernel_code) + if kernel_obj is None: + kernel_obj = compile_load_kernel(kernel_code, arch, logger) + cc_cache[kernel_code] = kernel_obj + if logger is not None: + logger.debug(f"Stored kernel ({kernel_obj}) (arch={arch}) in global cache.\n{kernel_code}") + elif logger is not None: + logger.debug(f"Loaded kernel ({kernel_obj}) (arch={arch}) from global cache.\n{kernel_code}") + return int(kernel_obj._handle) + + +cdef inline intptr_t get_or_compile_kernel(str kernel_code, str arch, object logger) except? 0: + with _kernel_lock: + return _get_or_compile_kernel(kernel_code, arch, logger) + + +cdef intptr_t get_kernel(str kernel_code, int device_id, object logger) except? 
0: + """ + Returns a pointer to the kernel function for a given kernel code and device id. + + In multithreaded environment, each thread has its own cache with pointers to the loaded + modules, if the cache is not populated, the shared cache guarded with _kernel_lock is used. + """ + cdef str arch = get_device_arch(device_id) + cdef dict local_kernel_cache = getattr(_tls, "local_kernel_cache", None) + if local_kernel_cache is None: + local_kernel_cache = {} + _tls.local_kernel_cache = local_kernel_cache + cdef dict local_cc_cache = local_kernel_cache.get(arch) + if local_cc_cache is None: + local_cc_cache = {} + local_kernel_cache[arch] = local_cc_cache + + cdef kernel_ptr = local_cc_cache.get(kernel_code) + if kernel_ptr is None: + kernel_ptr = get_or_compile_kernel(kernel_code, arch, logger) + local_cc_cache[kernel_code] = kernel_ptr + elif logger is not None: + logger.debug(f"Loaded kernel ({kernel_ptr}) for device {device_id=} ({arch=}) from thread local cache.\n{kernel_code}") + return kernel_ptr diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/args.h b/cuda_core/cuda/core/experimental/include/strided_copy/args.h new file mode 100644 index 0000000000..64605f106b --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/args.h @@ -0,0 +1,37 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_ARGS_H +#define CUDA_CORE_STRIDED_COPY_ARGS_H + +#include "impl/type_utils.h" + +#if defined(_MSC_VER) +// For Visual Studio, use __restrict +#define RESTRICT __restrict +#elif defined(__GNUC__) || defined(__clang__) +// For GCC and Clang, use __restrict__ +#define RESTRICT __restrict__ +#else +// Fallback for other compilers, or if restrict is not supported +#define RESTRICT +#endif + +namespace cuda_core +{ +template +struct KernelArgs +{ + void *RESTRICT dst_ptr; + const void *RESTRICT src_ptr; + int64_t dst_shape[N]; + int64_t src_shape[N]; + int64_t dst_strides[N]; + int64_t src_strides[N]; + int64_t grid_arg; +}; +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_ARGS_H diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/elementwise.h b/cuda_core/cuda/core/experimental/include/strided_copy/elementwise.h new file mode 100644 index 0000000000..0578934378 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/elementwise.h @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_ELEMENTWISE_H +#define CUDA_CORE_STRIDED_COPY_ELEMENTWISE_H + +#include "args.h" +#include "impl/array_view.h" +#include "impl/elementwise.h" +#include "impl/type_utils.h" +#include "impl/utils.h" +#include "impl/vec.h" + +#define ELEMENTWISE_KERNEL(stride_t, dst_ndim, src_ndim, itemsize, \ + needs_grid_stride_loop) \ + extern "C" \ + { \ + constexpr int max_ndim = dst_ndim > src_ndim ? 
dst_ndim : src_ndim; \ + void __global__ execute(const cuda_core::KernelArgs args) \ + { \ + cuda_core::elementwise_copy \ + kernel; \ + kernel(args); \ + } \ + } + +namespace cuda_core +{ + +template +struct elementwise_copy +{ + using dtype_t = opaque_t; + using dst_coords_t = vec; + using src_coords_t = vec; + using dst_array_view_t = array_view; + using src_array_view_t = array_view; + using grid_indexer_t = element_indexer; + constexpr static bool has_equal_shapes = dst_ndim == src_ndim; + constexpr static int ndim = dst_ndim > src_ndim ? dst_ndim : src_ndim; + + DEV void operator()(const KernelArgs args) const + { + dst_array_view_t dst_array_view{static_cast(args.dst_ptr), + dst_coords_t{args.dst_shape}, + dst_coords_t{args.dst_strides}}; + src_array_view_t src_array_view{static_cast(args.src_ptr), + src_coords_t{args.src_shape}, + src_coords_t{args.src_strides}}; + auto kernel = elementwise_copy_impl{}; + kernel(std::move(dst_array_view), std::move(src_array_view), + grid_indexer_t{static_cast(args.grid_arg)}); + } +}; + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_ELEMENTWISE_H diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/array_view.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/array_view.h new file mode 100644 index 0000000000..b0e2cef8d1 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/array_view.h @@ -0,0 +1,63 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_ARRAY_VIEW_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_ARRAY_VIEW_H_ + +#include "utils.h" +#include "vec.h" + +#if defined(_MSC_VER) +// For Visual Studio, use __restrict +#define RESTRICT __restrict +#elif defined(__GNUC__) || defined(__clang__) +// For GCC and Clang, use __restrict__ +#define RESTRICT __restrict__ +#else +// Fallback for other compilers, or if restrict is not supported +#define RESTRICT +#endif + +namespace cuda_core +{ + +template +struct array_view +{ + // While indices cannot be negative (only strides can), + // we're using the same 32- or 64-bit signed type to represent both + // indices and strides for simplicity. In the end we need to convert + // both to the same signed type when computing the offset. 
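+  // For example, with shape {2, 3} and strides {3, 1} (in elements), index
+  // {1, 2} maps to offset 1*3 + 2*1 = 5, i.e. dot(idx, strides()).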
+ using coords_t = _coords_t; + using stride_t = typename coords_t::type; + using dtype_t = T; + static constexpr int ndim = coords_t::ndim; + + HOST_DEV constexpr array_view(T *__restrict__ data, coords_t shape, + coords_t strides) + : shape_(shape), strides_(strides), data_(data) {} + + HOST_DEV T &operator[](const coords_t idx) const + { + return data_[offset(idx)]; + } + HOST_DEV T &operator[](const stride_t offset) const { return data_[offset]; } + HOST_DEV stride_t offset(const coords_t idx) const + { + return dot(idx, strides()); + } + HOST_DEV coords_t shape() const { return shape_; } + HOST_DEV coords_t strides() const { return strides_; } + HOST_DEV T *data() const { return data_; } + +protected: + coords_t shape_; + coords_t strides_; + T *RESTRICT data_; +}; + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_ARRAY_VIEW_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/elementwise.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/elementwise.h new file mode 100644 index 0000000000..a0f8d49c02 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/elementwise.h @@ -0,0 +1,128 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_ELEMENTWISE_H +#define CUDA_CORE_STRIDED_COPY_IMPL_ELEMENTWISE_H + +#include "array_view.h" +#include "type_utils.h" +#include "utils.h" +#include "vec.h" + +namespace cuda_core +{ + +namespace detail +{ + +template +DEV coords_t unravel_idx(const stride_t flat_idx, const coords_t shape) +{ + constexpr int ndim = coords_t::ndim; + if constexpr (ndim <= 0) + { + return {}; + } + else if constexpr (ndim == 1) + { + return {flat_idx}; + } + else if constexpr (ndim > 1) + { + + // the extents cannot be negative and the arithmetic on unsigned integer + // is noticeably faster + using u_stride_t = typename type_traits::unsign::type; + u_stride_t u_flat_idx = flat_idx; + coords_t unraveled_coords; +#pragma unroll + for (int i = ndim - 1; i >= 1; i--) + { + u_stride_t extent = shape[i]; + if (extent & (extent - 1)) + { + u_stride_t next_flat_idx = u_flat_idx / extent; + unraveled_coords[i] = u_flat_idx - next_flat_idx * extent; + u_flat_idx = next_flat_idx; + } + else + { + unraveled_coords[i] = u_flat_idx & (extent - 1); + u_flat_idx >>= ffs(extent) - 1; + } + } + unraveled_coords[0] = u_flat_idx; + return unraveled_coords; + } +} + +} // namespace detail + +template +struct element_indexer +{ + // stride_t can be 32-bit integer for tensor_volume and gridDim * blockDim up + // to INT_MAX, this way unsigned x < INT_MAX; x += INT_MAX cannot overflow + using ustride_t = typename type_traits::unsign::type; + static constexpr bool needs_grid_stride_loop = _needs_grid_stride_loop; + + constexpr HOST_DEV element_indexer(const stride_t tensor_volume) + : tensor_volume(tensor_volume) {} + + template + DEV void with_grid_stride_loop(Cb &&cb) const + { + // early cast the special indexing variables to the desired integer width + // type to avoid arithmetic on 32-bit integers when 64-bit stride_t is used + const ustride_t thread_idx = threadIdx.x; + const ustride_t block_idx = blockIdx.x; + const ustride_t block_dim = blockDim.x; + if constexpr (!needs_grid_stride_loop) + { + const ustride_t x = block_idx * block_dim + thread_idx; + if (x < tensor_volume) + { + cb(x); + } + } + else if constexpr (needs_grid_stride_loop) + { + const 
ustride_t grid_dim = gridDim.x; + const ustride_t grid_size = grid_dim * block_dim; + for (ustride_t x = block_idx * block_dim + thread_idx; x < tensor_volume; + x += grid_size) + { + cb(x); + } + } + } + + ustride_t tensor_volume; +}; + +template +struct elementwise_copy_impl +{ + using stride_t = typename dst_array_view_t::stride_t; + + DEV void operator()(const dst_array_view_t &&dst_view, + const src_array_view_t &&src_view, + const grid_indexer_t &&grid_helper) + { + grid_helper.with_grid_stride_loop([=](const stride_t flat_element_idx) + { + const auto dst_coords = + detail::unravel_idx(flat_element_idx, dst_view.shape()); + const auto src_coords = + cond_val(bconst(), dst_coords, + detail::unravel_idx(flat_element_idx, src_view.shape())); + dst_view[dst_coords] = src_view[src_coords]; }); + } +}; + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_ELEMENTWISE_H diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/type_utils.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/type_utils.h new file mode 100644 index 0000000000..72d0b58c1e --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/type_utils.h @@ -0,0 +1,47 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_TYPE_UTILS_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_TYPE_UTILS_H_ + +namespace cuda_core +{ +using int32_t = int; +using uint32_t = unsigned int; +using int64_t = long long int; +using uint64_t = unsigned long long int; +static_assert(sizeof(int32_t) == 4, "int32_t must be 4 bytes"); +static_assert(sizeof(uint32_t) == 4, "uint32_t must be 4 bytes"); +static_assert(sizeof(int64_t) == 8, "int64_t must be 8 bytes"); +static_assert(sizeof(uint64_t) == 8, "uint64_t must be 8 bytes"); + +// Use a struct to represent type of element so that we don't rely +// on actual representation of the type, available arithmetic etc. +template +struct alignas(n_bytes) opaque_t +{ + char data[n_bytes]; +}; + +static_assert(sizeof(opaque_t<1>) == 1, "opaque_t<1> must be 1 byte"); +static_assert(sizeof(opaque_t<2>) == 2, "opaque_t<2> must be 2 bytes"); +static_assert(sizeof(opaque_t<4>) == 4, "opaque_t<4> must be 4 bytes"); +static_assert(sizeof(opaque_t<8>) == 8, "opaque_t<8> must be 8 bytes"); +static_assert(sizeof(opaque_t<16>) == 16, "opaque_t<16> must be 16 bytes"); + +static_assert(alignof(opaque_t<1>) == alignof(unsigned char), + "opaque_t<1> must be 1 byte"); +static_assert(alignof(opaque_t<2>) == alignof(unsigned short), + "opaque_t<2> must be 2 bytes"); +static_assert(alignof(opaque_t<4>) == alignof(unsigned int), + "opaque_t<4> must be 4 bytes"); +static_assert(alignof(opaque_t<8>) == alignof(unsigned long long int), + "opaque_t<8> must be 8 bytes"); +#ifdef __CUDA_ARCH__ +static_assert(alignof(opaque_t<16>) == 16, "opaque_t<16> must be 16 bytes"); +#endif +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_TYPE_UTILS_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/utils.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/utils.h new file mode 100644 index 0000000000..a43bc72f44 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/utils.h @@ -0,0 +1,180 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. 
+// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_UTILS_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_UTILS_H_ + +#include "type_utils.h" + +#if defined(__CUDACC__) +#define HOST_DEV __host__ __device__ __forceinline__ +#define DEV __device__ __forceinline__ +#else +#define HOST_DEV +#define DEV +#endif + +namespace cuda_core +{ + +// Some of stl type traits are not available with nvrtc +namespace type_traits +{ +template +struct conditional +{ +}; + +template +struct conditional +{ + using type = T; +}; + +template +struct conditional +{ + using type = F; +}; + +template +struct enable_if +{ +}; +template +struct enable_if +{ + typedef T type; +}; + +template +struct unsign +{ +}; + +template <> +struct unsign +{ + using type = uint64_t; +}; + +template <> +struct unsign +{ + using type = uint32_t; +}; + +template +struct is_32_or_64_int +{ + static constexpr bool value = false; +}; + +template <> +struct is_32_or_64_int +{ + static constexpr bool value = true; +}; + +template <> +struct is_32_or_64_int +{ + static constexpr bool value = true; +}; + +template +struct min_val +{ +}; + +template <> +struct min_val +{ + static constexpr int32_t value = -2147483648; +}; + +template <> +struct min_val +{ + static constexpr int64_t value = -9223372036854775808LL; +}; +} // namespace type_traits + +template +struct const_val +{ + using type = T; + static constexpr T value = val; +}; + +template +using iconst = const_val; + +template +using bconst = const_val; + +template +HOST_DEV auto constexpr cond_val(bconst, true_val_t &&true_val, + false_val_t &&false_val) +{ + return false_val; +} + +template +HOST_DEV auto constexpr cond_val(bconst, true_val_t &&true_val, + false_val_t &&false_val) +{ + return true_val; +} + +#if defined(__CUDACC__) + +DEV int ffs(uint32_t x) { return __ffs(x); } + +DEV int ffs(int32_t x) { return __ffs(x); } + +DEV int ffs(uint64_t x) { return __ffsll(x); } + +DEV int ffs(int64_t x) { return __ffsll(x); } + +#endif + +HOST_DEV constexpr int log2_floor(const int k) +{ + return k == 1 ? 0 : 1 + log2_floor(k >> 1); +} + +template +struct mod_div +{ + static_assert(k > 0, "k must be positive"); + static_assert((k & (k - 1)) == 0, "k must be a power of 2"); + static constexpr int value = k; + static constexpr int log2 = log2_floor(k); + static constexpr int mask = k - 1; + HOST_DEV constexpr int operator()() const { return k; } +}; + +template +HOST_DEV constexpr T operator/(const T a, const mod_div) +{ + return a >> mod_div::log2; +} + +template +HOST_DEV constexpr T operator%(const T a, const mod_div) +{ + return a & mod_div::mask; +} + +template +HOST_DEV constexpr T operator*(const T a, const mod_div) +{ + return a << mod_div::log2; +} + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_UTILS_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy/impl/vec.h b/cuda_core/cuda/core/experimental/include/strided_copy/impl/vec.h new file mode 100644 index 0000000000..1a75e42d2c --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy/impl/vec.h @@ -0,0 +1,212 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. +// All rights reserved. 
+// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_IMPL_VEC_H_ +#define CUDA_CORE_STRIDED_COPY_IMPL_VEC_H_ + +#include "utils.h" + +namespace cuda_core +{ + +template +struct vec_base +{ + T v[N]; + + template ::value && ...)>::type> + HOST_DEV constexpr vec_base(Components... components) : v{T(components)...} {} + + template ::value>::type> + HOST_DEV constexpr vec_base(const U *ptr) + { + for (int i = 0; i < N; i++) + { + v[i] = ptr[i]; + } + } + + HOST_DEV constexpr T &operator[](int i) { return v[i]; } + HOST_DEV constexpr const T &operator[](int i) const { return v[i]; } +}; + +template +struct vec_base +{ +}; + +template +struct vec : vec_base +{ + using base_t = vec_base; + using type = T; + constexpr static int ndim = N; + + constexpr vec() = default; + + template + HOST_DEV constexpr vec(Components... components) : base_t{components...} {} + + template + HOST_DEV constexpr vec(const U *ptr, int ndim) : base_t(ptr, ndim) {} + + HOST_DEV constexpr int size() const { return ndim; } + + template + HOST_DEV constexpr auto last(const iconst) const + { + static_assert(K <= ndim); + return slice(iconst(), iconst()); + } + + template + HOST_DEV constexpr auto first(const iconst) const + { + static_assert(K <= ndim); + return slice(iconst<0>(), iconst()); + } + + template + HOST_DEV constexpr vec slice(const iconst, + const iconst) const + { + static_assert(start >= 0 && end <= ndim); + constexpr int slice_ndim = end - start; + static_assert(slice_ndim >= 0); + if constexpr (slice_ndim != 0) + { + vec result; +#pragma unroll + for (int i = 0; i < slice_ndim; i++) + { + result[i] = this->operator[](start + i); + } + return result; + } + return {}; + } +}; + +template +HOST_DEV constexpr vec cat(const vec a, const vec b) +{ + constexpr int ndim = N + M; + if constexpr (ndim != 0) + { + vec result; + if constexpr (N > 0) + { +#pragma unroll + for (int i = 0; i < N; i++) + { + result[i] = a[i]; + } + } + if constexpr (M > 0) + { +#pragma unroll + for (int i = 0; i < M; i++) + { + result[N + i] = b[i]; + } + } + return result; + } + return {}; +} + +template +HOST_DEV constexpr auto vector_bin_op(const vec a, const vec b, + Op &&op) +{ + static_assert(N > 0, "N must be positive"); + using result_t = decltype(op(a[0], b[0])); + vec result; +#pragma unroll + for (int i = 0; i < N; i++) + { + result[i] = op(a[i], b[i]); + } + return result; +} + +template +HOST_DEV constexpr auto operator+(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a + b; }); +} + +template +HOST_DEV constexpr auto operator*(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a * b; }); +} + +template +HOST_DEV constexpr auto operator-(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a - b; }); +} + +template +HOST_DEV constexpr auto operator/(const vec a, const vec b) +{ + return vector_bin_op(a, b, [](T a, T b) + { return a / b; }); +} + +template +HOST_DEV constexpr bool any(Pred &&pred, const vec a, + const vec... vs) +{ + for (int i = 0; i < N; i++) + { + if (pred(a[i], vs[i]...)) + return true; + } + return false; +} + +template +HOST_DEV constexpr bool all(Pred &&pred, const vec a, + const vec... 
vs) +{ + for (int i = 0; i < N; i++) + { + if (!pred(a[i], vs[i]...)) + return false; + } + return true; +} + +template +HOST_DEV constexpr T dot(const vec a, const vec b) +{ + if constexpr (N == 0) + { + return 0; + } + else if constexpr (N != 0) + { + T sum = a[0] * b[0]; +#pragma unroll + for (int i = 1; i < N; i++) + { + sum += a[i] * b[i]; + } + return sum; + } +} + +} // namespace cuda_core + +#endif // CUDA_CORE_STRIDED_COPY_IMPL_VEC_H_ diff --git a/cuda_core/cuda/core/experimental/include/strided_copy_utils.hpp b/cuda_core/cuda/core/experimental/include/strided_copy_utils.hpp new file mode 100644 index 0000000000..758952cb82 --- /dev/null +++ b/cuda_core/cuda/core/experimental/include/strided_copy_utils.hpp @@ -0,0 +1,85 @@ +// SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// +// SPDX-License-Identifier: LicenseRef-NVIDIA-SOFTWARE-LICENSE + +#ifndef CUDA_CORE_STRIDED_COPY_UTILS_HPP +#define CUDA_CORE_STRIDED_COPY_UTILS_HPP + +#include +#include +#include +#include +#include + +#include "layout.hpp" +#include "strided_copy/args.h" + +template +void _get_strided_copy_args_ndim( + std::unique_ptr> &args, + void *dst_ptr, const void *src_ptr, + int dst_ndim, int src_ndim, + int64_t *dst_shape, int64_t *src_shape, + int64_t *dst_strides, int64_t *src_strides, + int64_t grid_arg) +{ + using uptr_t = std::unique_ptr, std::function>; + uptr_t ptr{new cuda_core::KernelArgs, [](void *p) + { delete (static_cast *>(p)); }}; + ptr->dst_ptr = dst_ptr; + ptr->src_ptr = src_ptr; + for (int i = 0; i < dst_ndim; i++) + { + ptr->dst_shape[i] = dst_shape[i]; + ptr->dst_strides[i] = dst_strides[i]; + } + for (int i = 0; i < src_ndim; i++) + { + ptr->src_shape[i] = src_shape[i]; + ptr->src_strides[i] = src_strides[i]; + } + ptr->grid_arg = grid_arg; + args = std::move(ptr); +} + +template +void _call_with_static_ndim(int ndim, Cb &&cb) +{ + if constexpr (i > max_ndim) + { + throw std::runtime_error("unsupported ndim"); + } + else if constexpr (i <= max_ndim) + { + if (ndim == i) + { + cb(std::integral_constant()); + } + else + { + _call_with_static_ndim(ndim, std::move(cb)); + } + } +} + +void inline _get_strided_copy_args( + std::unique_ptr> &args, + void *dst_ptr, const void *src_ptr, + int dst_ndim, int src_ndim, + int64_t *dst_shape, int64_t *src_shape, + int64_t *dst_strides, int64_t *src_strides, + int64_t grid_arg) +{ + int max_ndim = dst_ndim >= src_ndim ? dst_ndim : src_ndim; + _call_with_static_ndim(max_ndim, [&](auto static_ndim_holder) + { + constexpr int static_ndim = decltype(static_ndim_holder)::value; + _get_strided_copy_args_ndim( + args, dst_ptr, src_ptr, + dst_ndim, src_ndim, + dst_shape, src_shape, + dst_strides, src_strides, + grid_arg); }); +} + +#endif // CUDA_CORE_STRIDED_COPY_UTILS_HPP diff --git a/cuda_core/cuda/core/experimental/utils.py b/cuda_core/cuda/core/experimental/utils.py index 3227f1eae1..daeecec849 100644 --- a/cuda_core/cuda/core/experimental/utils.py +++ b/cuda_core/cuda/core/experimental/utils.py @@ -1,9 +1,9 @@ # SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
# # SPDX-License-Identifier: Apache-2.0 - from cuda.core.experimental._layout import StridedLayout # noqa: F401 from cuda.core.experimental._memoryview import ( StridedMemoryView, # noqa: F401 args_viewable_as_strided_memory, # noqa: F401 ) +from cuda.core.experimental._strided_copy._copy import CopyAllocatorOptions # noqa: F401 diff --git a/cuda_core/pyproject.toml b/cuda_core/pyproject.toml index af99ddd361..d08701397a 100644 --- a/cuda_core/pyproject.toml +++ b/cuda_core/pyproject.toml @@ -69,7 +69,7 @@ issues = "https://github.com/NVIDIA/cuda-python/issues/" include = ["cuda.core*"] [tool.setuptools.package-data] -"cuda.core.experimental.include" = ["*.h", "*.hpp", "*.cuh"] +"cuda.core.experimental.include" = ["**/*.h", "**/*.hpp", "**/*.cuh"] [tool.setuptools.dynamic] version = { attr = "cuda.core._version.__version__" } diff --git a/cuda_core/tests/test_strided_memory_view_copy.py b/cuda_core/tests/test_strided_memory_view_copy.py new file mode 100644 index 0000000000..347d8f22a9 --- /dev/null +++ b/cuda_core/tests/test_strided_memory_view_copy.py @@ -0,0 +1,639 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# +# SPDX-License-Identifier: Apache-2.0 + + +import ctypes +import logging +import random +from contextlib import contextmanager +from enum import Enum +from io import StringIO + +import cuda.core.experimental as ccx +import numpy as np +import pytest +from cuda.core.experimental import Buffer +from cuda.core.experimental import ( + system as ccx_system, +) +from cuda.core.experimental._strided_copy._copy import _with_logger +from cuda.core.experimental.utils import ( + CopyAllocatorOptions, + StridedLayout, + StridedMemoryView, +) + +try: + import cupy as cp +except ImportError: + cp = None + +from helpers.layout import ( + NamedParam, + dtype_from_itemsize, + inv_permutation, + permuted, + pretty_name, +) + + +class CopyDirection(Enum): + D2D = "d2d" + D2H = "d2h" + H2D = "h2d" + + +class SrcFormat(Enum): + C_CONTIGUOUS = "c_contiguous" # the src is contiguous in some order + F_CONTIGUOUS = "f_contiguous" # the src is contiguous in some order + SLICED = "sliced" # the src is sliced (has gaps) + + +class Transpose(Enum): + SAME_ORDER = False # the src and dst have the same stride order + INVERSE = "inverse" # the src and dst have the inverse stride order (e.g. 
F <-> C) + PERMUTATION = "permutation" # the src and dst have permuted stride orders + + +class DstFormat(Enum): + DENSE = "dense" # the dst is contigious in some order + SLICED = "sliced" # the dst is sliced (has gaps) + + +class Broadcast(Enum): + NO = False + RIGHT = "right" + LEFT = "left" + RIGHT_LEFT = "right_left" + + +class CustomAllocator(Enum): + HOST = "host" + DEVICE = "device" + BOTH = "host_device" + + +_ITEMSIZES = [1, 2, 4, 8, 16] + +py_rng = random.Random(43) +num_devices = ccx_system.num_devices + + +def get_ptr(array): + if isinstance(array, np.ndarray): + return array.ctypes.data + elif isinstance(array, cp.ndarray): + return array.data.ptr + else: + raise ValueError(f"Invalid array: {type(array)}") + + +@contextmanager +def with_ccx_device(device_id): + current_dev = ccx.Device() + current_dev_id = current_dev.device_id + dev = ccx.Device(device_id) + try: + dev.set_current() + yield dev + finally: + if current_dev_id != device_id: + current_dev.set_current() + + +_log_stream = StringIO() +_logger = logging.Logger("smv_copy_test", level=logging.DEBUG) +_logger.addHandler(logging.StreamHandler(_log_stream)) +_logger.setLevel(logging.DEBUG) + + +@contextmanager +def with_logger(): + try: + with _with_logger(_logger): + yield _log_stream + finally: + _log_stream.truncate(0) + + +def get_src_order(src_format): + assert isinstance(src_format, SrcFormat) + if src_format == SrcFormat.F_CONTIGUOUS: + return "F" + return "C" + + +def get_dst_order(rng, src_format, transpose, shape): + assert isinstance(transpose, Transpose) + src_order = get_src_order(src_format) + if transpose == Transpose.SAME_ORDER: + return src_order + elif transpose == Transpose.INVERSE: + return "F" if src_order == "C" else "C" + else: + assert transpose == Transpose.PERMUTATION + perm = list(range(len(shape))) + rng.shuffle(perm) + return tuple(perm) + + +def get_src_shape(base_src_shape, broadcast): + assert isinstance(broadcast, Broadcast) + match broadcast: + case Broadcast.NO | Broadcast.LEFT: + return base_src_shape + case Broadcast.RIGHT | Broadcast.RIGHT_LEFT: + return base_src_shape + (1,) + raise ValueError(f"Invalid broadcast: {broadcast}") + + +def get_dst_shape(rng, base_src_shape, broadcast): + assert isinstance(broadcast, Broadcast) + match broadcast: + case Broadcast.NO: + return base_src_shape + case Broadcast.RIGHT: + return base_src_shape + (rng.randint(2, 5),) + case Broadcast.RIGHT_LEFT: + return (rng.randint(2, 5),) + base_src_shape + (rng.randint(2, 5),) + case Broadcast.LEFT: + return (rng.randint(2, 5),) + base_src_shape + raise ValueError(f"Invalid broadcast: {broadcast}") + + +def is_h2h_needed(direction, src_shape, src_format, dst_shape, dst_format, broadcast): + if direction == CopyDirection.D2D: + return False + elif direction == CopyDirection.H2D: + return len(src_shape) > 1 and (src_format == SrcFormat.SLICED) or broadcast != Broadcast.NO + else: + assert direction == CopyDirection.D2H + return len(dst_shape) > 1 and (dst_format == DstFormat.SLICED) + + +def get_src(is_host, src_format, src_order, implicit_c, shape, itemsize): + assert isinstance(src_format, SrcFormat) + assert isinstance(implicit_c, bool) + mod = np if is_host else cp + dtype = dtype_from_itemsize(itemsize) + slices = None + if src_format == SrcFormat.SLICED: + shape = tuple(e + 2 for e in shape) + slices = tuple(slice(1, -1) for _ in shape) + array = mod.arange(np.prod(shape), dtype=dtype).reshape(shape, order=src_order) + smv = StridedMemoryView.from_dlpack(array, -1) + # enforce implicit C strides + if 
implicit_c and smv.layout.is_contiguous_c: + layout = StridedLayout(smv.shape, None, itemsize) + smv = smv.view(layout=layout) + if slices is not None: + base_ptr = smv.ptr + smv = smv.view(layout=smv.layout[slices]) + assert smv.layout.slice_offset != 0 + assert smv.ptr == base_ptr + smv.layout.slice_offset_in_bytes + assert smv.shape != shape + assert smv.layout.volume > 0 + return smv + + +def get_dst( + is_host, + src_order, + transpose, + dst_order, + implicit_c, + dst_format, + dst_shape, + itemsize, +): + assert isinstance(dst_format, DstFormat) + assert isinstance(implicit_c, bool) + mod = np if is_host else cp + dtype = dtype_from_itemsize(itemsize) + slices = None + if dst_format == DstFormat.SLICED: + dst_shape = tuple(e + 2 for e in dst_shape) + slices = tuple(slice(1, -1) for _ in dst_shape) + if transpose == Transpose.SAME_ORDER: + assert src_order in "CF" + assert dst_order == src_order + elif transpose == Transpose.INVERSE: + assert dst_order in "CF" + assert dst_order != src_order + else: + assert transpose == Transpose.PERMUTATION + assert isinstance(dst_order, tuple) + if transpose == Transpose.PERMUTATION: + array = mod.arange(np.prod(dst_shape), dtype=dtype) + array = array.reshape(permuted(dst_shape, dst_order)).transpose(inv_permutation(dst_order)) + else: + array = mod.arange(np.prod(dst_shape), dtype=dtype).reshape(dst_shape, order=dst_order) + smv = StridedMemoryView.from_dlpack(array, -1) + # enforce implicit C strides + if implicit_c and smv.layout.is_contiguous_c: + layout = StridedLayout(smv.shape, None, itemsize) + smv = smv.view(layout=layout) + if slices is not None: + base_ptr = smv.ptr + smv = smv.view(layout=smv.layout[slices]) + assert smv.layout.slice_offset != 0 + assert smv.ptr == base_ptr + smv.layout.slice_offset_in_bytes + assert smv.shape != dst_shape + assert smv.layout.volume > 0 + return smv + + +def as_array(device_id, smv): + min_offset, max_offset = smv.layout.offset_bounds + size = (max_offset - min_offset + 1) * smv.layout.itemsize + dtype = smv.dtype + if dtype is None: + dtype = dtype_from_itemsize(smv.layout.itemsize) + if device_id is None: + c_mem = memoryview((ctypes.c_char * size).from_address(smv.ptr)) + np_array = np.frombuffer(c_mem, dtype=dtype) + if smv.layout.strides_in_bytes is None: + return np_array.reshape(smv.shape, order="C") + else: + return np.lib.stride_tricks.as_strided( + np_array, + shape=smv.shape, + strides=smv.layout.strides_in_bytes, + ) + else: + assert smv.is_device_accessible + um = cp.cuda.UnownedMemory(smv.ptr, size, smv, device_id) + mem = cp.cuda.MemoryPointer(um, 0) + cp_array = cp.ndarray( + shape=smv.shape, + strides=smv.layout.strides_in_bytes, + dtype=dtype, + memptr=mem, + ) + return cp_array + + +@pytest.mark.parametrize( + ( + "src_shape", + "dst_shape", + "direction", + "src_format", + "transpose", + "dst_format", + "implicit_c", + "broadcast", + "itemsize", + "device_id", + "default_stream", + "src_order", + "dst_order", + "copy_from", + "blocking", + ), + [ + ( + NamedParam("src_shape", src_shape), + NamedParam("dst_shape", dst_shape), + direction, + src_format, + transpose, + dst_format, + NamedParam("implicit_c", implicit_c), + broadcast, + NamedParam("itemsize", py_rng.choice(_ITEMSIZES)), + NamedParam( + "device_id", + py_rng.randint(0, num_devices - 1) if num_devices >= 0 else None, + ), + NamedParam("default_stream", py_rng.choice([True, False])), + NamedParam("src_order", get_src_order(src_format)), + NamedParam("dst_order", get_dst_order(py_rng, src_format, transpose, dst_shape)), + 
NamedParam("copy_from", py_rng.choice([True, False])), + NamedParam("blocking", py_rng.choice([True, False])), + ) + for base_src_shape in [ + tuple(), + (11,), + (16,), + (16, 12), + (1, 16, 1, 12, 1), + (13, 12, 8), + (4, 13, 15), + ] + for direction in list(CopyDirection) + for src_format in list(SrcFormat) + for transpose in list(Transpose) + for dst_format in list(DstFormat) + for implicit_c in [True, False] + if not implicit_c or src_format == SrcFormat.C_CONTIGUOUS + for broadcast in [py_rng.choice([Broadcast.NO, py_rng.choice(list(Broadcast)[1:])])] + for src_shape in [get_src_shape(base_src_shape, broadcast)] + for dst_shape in [get_dst_shape(py_rng, base_src_shape, broadcast)] + if src_format != SrcFormat.SLICED or len(src_shape) > 0 + if transpose == Transpose.SAME_ORDER or len(dst_shape) > 1 + if dst_format != DstFormat.SLICED or len(dst_shape) > 0 + ], + ids=pretty_name, +) +def test_strided_memory_view_copy( + src_shape, + dst_shape, + direction, + src_format, + transpose, + dst_format, + implicit_c, + broadcast, + itemsize, + device_id, + default_stream, + src_order, + dst_order, + copy_from, + blocking, +): + device_id = device_id.value + src_shape = src_shape.value + dst_shape = dst_shape.value + implicit_c = implicit_c.value + itemsize = itemsize.value + default_stream = default_stream.value + src_order = src_order.value + dst_order = dst_order.value + copy_from = copy_from.value + blocking = blocking.value + + if device_id is None: + pytest.skip("No devices available") + if cp is None: + pytest.skip("cupy is not installed") + + assert isinstance(direction, CopyDirection) + is_src_host = direction == CopyDirection.H2D + is_dst_host = direction == CopyDirection.D2H + + cp_stream = None + stream = None + try: + with cp.cuda.Device(device_id): + if default_stream: + stream = ccx.Device(device_id).default_stream + cp_stream = cp.cuda.ExternalStream(int(stream.handle), device_id) + else: + cp_stream = cp.cuda.Stream(non_blocking=True) + stream = ccx.Stream.from_handle(cp_stream.ptr) + + with cp_stream: + src = get_src(is_src_host, src_format, src_order, implicit_c, src_shape, itemsize) + dst = get_dst( + is_dst_host, + src_order, + transpose, + dst_order, + implicit_c, + dst_format, + dst_shape, + itemsize, + ) + + if not is_src_host: + assert src.device_id == device_id + assert src.is_device_accessible + if not is_dst_host: + assert dst.is_device_accessible + assert dst.device_id == device_id + + if broadcast != Broadcast.NO: + assert src.shape != dst.shape + else: + assert src.shape == dst.shape + + with with_ccx_device(device_id), with_logger() as log_stream: + if copy_from: + dst.copy_from(src, stream, blocking=blocking) + else: + src.copy_to(dst, stream, blocking=blocking) + debug_log = log_stream.getvalue() + + if blocking or is_h2h_needed(direction, src_shape, src_format, dst_shape, dst_format, broadcast): + assert f"Syncing stream {int(stream.handle)}" in debug_log + else: + # if no extra H2H is needed we should respect non-blocking flag + assert "Syncing stream" not in debug_log + + src_array = as_array(None if is_src_host else device_id, src) + dst_array = as_array(None if is_dst_host else device_id, dst) + assert src_array.shape == src.shape == src_shape + assert dst_array.shape == dst.shape == dst_shape + if src.layout.strides_in_bytes is None: + dense_strides = StridedLayout.dense(src.shape, src.layout.itemsize).strides_in_bytes + assert src_array.strides == dense_strides + else: + assert src_array.strides == src.layout.strides_in_bytes + if 
dst.layout.strides_in_bytes is None: + dense_strides = StridedLayout.dense(dst.shape, dst.layout.itemsize).strides_in_bytes + assert dst_array.strides == dense_strides + else: + assert dst_array.strides == dst.layout.strides_in_bytes + + with cp.cuda.Device(device_id): + if not blocking: + stream.sync() + if not is_src_host: + src_array = cp.asnumpy(src_array) + if not is_dst_host: + dst_array = cp.asnumpy(dst_array) + + if broadcast != Broadcast.NO: + src_array = np.broadcast_to(src_array, dst_array.shape) + np.testing.assert_equal(src_array, dst_array) + + finally: + if not default_stream and stream is not None: + stream.close() + + +class CustomDeviceAllocator(ccx.MemoryResource): + def __init__(self, device: ccx.Device): + self.device = device + self._mr = device.memory_resource + self._recorded = [] + + def allocate(self, size, stream=None): + self._recorded.append((size, stream)) + return self._mr.allocate(size, stream) + + def deallocate(self, ptr, size, stream=None): + self._mr.deallocate(ptr, size, stream) + + @property + def is_device_accessible(self) -> bool: + return True + + @property + def is_host_accessible(self) -> bool: + return False + + @property + def device_id(self) -> int: + return self.device.device_id + + +class CustomHostAllocator(ccx.LegacyPinnedMemoryResource): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self._recorded = [] + + def allocate(self, size, stream=None): + self._recorded.append((size, stream)) + return super().allocate(size, stream) + + +@pytest.mark.parametrize( + ("direction", "custom_allocator", "use_dataclass", "dtype", "device_id"), + [ + ( + direction, + custom_allocator, + use_dataclass, + py_rng.choice([np.float16, np.float32, np.float64, np.complex64]), + NamedParam( + "device_id", + py_rng.randint(0, num_devices - 1) if num_devices >= 0 else None, + ), + ) + for direction in [CopyDirection.H2D, CopyDirection.D2H] + for custom_allocator in [ + CustomAllocator.HOST, + CustomAllocator.DEVICE, + CustomAllocator.BOTH, + ] + for use_dataclass in [True, False] + ], + ids=pretty_name, +) +def test_custom_allocator(direction, custom_allocator, use_dataclass, dtype, device_id): + if cp is None: + pytest.skip("cupy is not installed") + + device_id = device_id.value + + assert isinstance(direction, CopyDirection) + is_src_host = direction == CopyDirection.H2D + is_dst_host = direction == CopyDirection.D2H + src_mod = np if is_src_host else cp + dst_mod = np if is_dst_host else cp + + shape = (111, 122) + with cp.cuda.Device(device_id): + a = src_mod.arange(np.prod(shape), dtype=dtype).reshape(shape, order="C") + a = a[:, ::-2] + b = dst_mod.arange(np.prod(shape), dtype=dtype).reshape(shape, order="F") + b = b[::-1, : shape[1] // 2] + + nbytes = a.nbytes + src_buf = Buffer.from_handle(get_ptr(a), nbytes, owner=a) + dst_buf = Buffer.from_handle(get_ptr(b), nbytes, owner=b) + src_layout = StridedLayout(a.shape, a.strides, a.itemsize, divide_strides=True) + dst_layout = StridedLayout(b.shape, b.strides, b.itemsize, divide_strides=True) + src = StridedMemoryView.from_buffer(src_buf, src_layout) + dst = StridedMemoryView.from_buffer(dst_buf, dst_layout) + + host_allocator = None + device_allocator = None + if custom_allocator in [CustomAllocator.HOST, CustomAllocator.BOTH]: + host_allocator = CustomHostAllocator() + if custom_allocator in [CustomAllocator.DEVICE, CustomAllocator.BOTH]: + device_allocator = CustomDeviceAllocator(ccx.Device(device_id)) + + if use_dataclass: + allocator = CopyAllocatorOptions(host=host_allocator, 
device=device_allocator) + else: + allocator = {k: v for k, v in [("host", host_allocator), ("device", device_allocator)] if v is not None} + + with with_ccx_device(device_id) as dev: + stream = dev.default_stream + dst.copy_from(src, stream, blocking=True, allocator=allocator) + + if host_allocator is not None: + assert host_allocator._recorded == [(nbytes, stream)] + if device_allocator is not None: + assert device_allocator._recorded == [(nbytes, stream)] + + src_array = as_array(None if is_src_host else device_id, src) + dst_array = as_array(None if is_dst_host else device_id, dst) + with cp.cuda.Device(device_id): + if not is_src_host: + src_array = cp.asnumpy(src_array) + if not is_dst_host: + dst_array = cp.asnumpy(dst_array) + np.testing.assert_equal(src_array, dst_array) + + +def test_wrong_shape(): + a = np.arange(10).reshape((2, 5)) + d = ccx.Device(0) + d.set_current() + layout = StridedLayout.dense((2, 6), a.itemsize) + buf = d.memory_resource.allocate(layout.required_size_in_bytes()) + a_view = StridedMemoryView.from_dlpack(a, -1) + b_view = StridedMemoryView.from_buffer(buf, layout) + with pytest.raises(ValueError, match="cannot be broadcast together"): + b_view.copy_from(a_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="cannot be broadcast together"): + a_view.copy_from(b_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="cannot be broadcast together"): + a_view.copy_to(b_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="cannot be broadcast together"): + b_view.copy_to(a_view, d.default_stream, blocking=True) + + +def test_wrong_dtype(): + a = np.arange(10, dtype=np.int32).reshape((2, 5)) + d = ccx.Device(0) + d.set_current() + layout = StridedLayout.dense((2, 5), a.itemsize) + buf = d.memory_resource.allocate(layout.required_size_in_bytes()) + a_view = StridedMemoryView.from_dlpack(a, -1) + b_view = StridedMemoryView.from_buffer(buf, layout, dtype=np.float32) + with pytest.raises(ValueError, match="destination and source dtypes"): + b_view.copy_from(a_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="destination and source dtypes"): + a_view.copy_to(b_view, d.default_stream, blocking=True) + + +def test_wrong_itemsize(): + a = np.arange(10, dtype=np.int32).reshape((2, 5)) + d = ccx.Device(0) + d.set_current() + layout = StridedLayout.dense((2, 5), 8) + buf = d.memory_resource.allocate(layout.required_size_in_bytes()) + a_view = StridedMemoryView.from_dlpack(a, -1) + b_view = StridedMemoryView.from_buffer(buf, layout) + with pytest.raises(ValueError, match="itemsize"): + b_view.copy_from(a_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="itemsize"): + a_view.copy_to(b_view, d.default_stream, blocking=True) + + +def test_overlapping_dst(): + a = np.arange(10) + a = np.lib.stride_tricks.sliding_window_view(a, 3, -1) + # do this manually, as through dlpack numpy marks the view readonly + buf = Buffer.from_handle(a.ctypes.data, a.nbytes, owner=a) + layout = StridedLayout(a.shape, a.strides, a.itemsize, divide_strides=True) + host_overlapping_view = StridedMemoryView.from_buffer(buf, layout) + layout = host_overlapping_view.layout + d = ccx.Device(0) + d.set_current() + buf = d.memory_resource.allocate(10 * a.itemsize) + dev_overlapping_view = StridedMemoryView.from_buffer(buf, host_overlapping_view.layout) + dense_layout = layout.to_dense() + dense_buf = d.memory_resource.allocate(dense_layout.required_size_in_bytes()) + dense_view = 
StridedMemoryView.from_buffer(dense_buf, dense_layout) + with pytest.raises(ValueError, match="destination layout is non-unique"): + dense_view.copy_to(host_overlapping_view, d.default_stream, blocking=True) + with pytest.raises(ValueError, match="destination layout is non-unique"): + dense_view.copy_to(dev_overlapping_view, d.default_stream, blocking=True)
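
A minimal usage sketch of the copy path exercised above, not part of the patch itself: it assumes a CUDA-capable device, NumPy, and the experimental API exactly as used by the tests in this diff (the no-argument `LegacyPinnedMemoryResource()` construction and the `-1` stream argument to `from_dlpack` are taken from the test code; the array and variable names are illustrative).

import numpy as np

import cuda.core.experimental as ccx
from cuda.core.experimental.utils import (
    CopyAllocatorOptions,
    StridedLayout,
    StridedMemoryView,
)

dev = ccx.Device(0)
dev.set_current()
stream = dev.default_stream

# Host source: a non-contiguous (sliced) NumPy array, viewed without copying.
a = np.arange(100, dtype=np.float32).reshape(10, 10)[:, ::2]
src = StridedMemoryView.from_dlpack(a, -1)

# Device destination: a dense layout with the source's shape and itemsize.
layout = StridedLayout.dense(src.shape, src.layout.itemsize)
buf = dev.memory_resource.allocate(layout.required_size_in_bytes())
dst = StridedMemoryView.from_buffer(buf, layout, dtype=a.dtype)

# If the copy needs temporary staging buffers (it does here, since the host
# source is strided), they are allocated from these resources; both fields
# are optional and default resources are used when omitted.
allocator = CopyAllocatorOptions(
    host=ccx.LegacyPinnedMemoryResource(),
    device=dev.memory_resource,
)
dst.copy_from(src, stream, allocator=allocator, blocking=True)

As in the tests, the caller is responsible for setting the proper current device before invoking `copy_from`/`copy_to`, and a dict such as `{"host": ..., "device": ...}` may be passed in place of `CopyAllocatorOptions`.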