From 3026bdef9a31691cc1a2180a5bbfa61d09c74eea Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 12:25:07 -0500
Subject: [PATCH 1/4] Remove clean_up_cffi_files fixture

---
 cuda_core/tests/conftest.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 5bd5b52b73..4d615d73da 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -1,9 +1,7 @@
 # Copyright 2024 NVIDIA Corporation. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-import glob
 import os
-import sys

 try:
     from cuda.bindings import driver
@@ -67,21 +65,6 @@ def pop_all_contexts():
     return pop_all_contexts


-# samples relying on cffi could fail as the modules cannot be imported
-sys.path.append(os.getcwd())
-
-
-@pytest.fixture(scope="session", autouse=True)
-def clean_up_cffi_files():
-    yield
-    files = glob.glob(os.path.join(os.getcwd(), "_cpu_obj*"))
-    for f in files:
-        try:  # noqa: SIM105
-            os.remove(f)
-        except FileNotFoundError:
-            pass  # noqa: SIM105
-
-
 skipif_testing_with_compute_sanitizer = pytest.mark.skipif(
     os.environ.get("CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER", "0") == "1",
     reason="The compute-sanitizer is running, and this test causes an API error.",

From 9303966970ce70752a20577d490af1bd33417eb1 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 12:25:40 -0500
Subject: [PATCH 2/4] Clean up FFI resources in examples/strided_memory_view.py

---
 cuda_core/examples/strided_memory_view.py | 58 +++++++++++++++--------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py
index a145f7a4e3..c0f11d6eb7 100644
--- a/cuda_core/examples/strided_memory_view.py
+++ b/cuda_core/examples/strided_memory_view.py
@@ -16,8 +16,10 @@
 # ################################################################################

 import importlib
+import shutil
 import string
 import sys
+import tempfile

 try:
     from cffi import FFI
@@ -73,8 +75,15 @@
         source_extension=".cpp",
         extra_compile_args=["-std=c++11"],
     )
-    cpu_prog.compile()
-    cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
+    temp_dir = tempfile.mkdtemp()
+    cpu_prog.compile(tmpdir=temp_dir)
+    saved_sys_path = sys.path
+    try:
+        sys.path.append(temp_dir)
+        cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
+    finally:
+        sys.path = saved_sys_path
+        shutil.rmtree(temp_dir)

 # Here is a concrete (again, very naive!) implementation on GPU:
 if cp:
@@ -140,29 +149,36 @@ def my_func(arr, work_stream):

 # This takes the CPU path
 if FFI:
-    # Create input array on CPU
-    arr_cpu = np.zeros(1024, dtype=np.int32)
-    print(f"before: {arr_cpu[:10]=}")
+    try:
+        # Create input array on CPU
+        arr_cpu = np.zeros(1024, dtype=np.int32)
+        print(f"before: {arr_cpu[:10]=}")

-    # Run the workload
-    my_func(arr_cpu, None)
+        # Run the workload
+        my_func(arr_cpu, None)

-    # Check the result
-    print(f"after: {arr_cpu[:10]=}")
-    assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
+        # Check the result
+        print(f"after: {arr_cpu[:10]=}")
+        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
+    finally:
+        # clean up temp directory
+        del cpu_func
+        shutil.rmtree(temp_dir)

 # This takes the GPU path
 if cp:
     s = dev.create_stream()
-    # Create input array on GPU
-    arr_gpu = cp.ones(1024, dtype=cp.int32)
-    print(f"before: {arr_gpu[:10]=}")
-
-    # Run the workload
-    my_func(arr_gpu, s)
-
-    # Check the result
-    print(f"after: {arr_gpu[:10]=}")
-    assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
-    s.close()
+    try:
+        # Create input array on GPU
+        arr_gpu = cp.ones(1024, dtype=cp.int32)
+        print(f"before: {arr_gpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_gpu, s)
+
+        # Check the result
+        print(f"after: {arr_gpu[:10]=}")
+        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
+    finally:
+        s.close()

From 5775dda65098ef3ef1243cb9b09f8df12dbe0014 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 13:22:34 -0500
Subject: [PATCH 3/4] Ensure temporary is deleted if compile fails

Make sure to del cpu_func, and my_func which references it.
---
 cuda_core/examples/strided_memory_view.py | 44 ++++++++++++-----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py
index c0f11d6eb7..f9a0b9a255 100644
--- a/cuda_core/examples/strided_memory_view.py
+++ b/cuda_core/examples/strided_memory_view.py
@@ -76,7 +76,10 @@ extra_compile_args=["-std=c++11"],
     )
     temp_dir = tempfile.mkdtemp()
-    cpu_prog.compile(tmpdir=temp_dir)
+    try:
+        cpu_prog.compile(tmpdir=temp_dir)
+    finally:
+        shutil.rmtree(temp_dir)
     saved_sys_path = sys.path
     try:
         sys.path.append(temp_dir)
         cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
@@ -147,25 +150,6 @@ def my_func(arr, work_stream):
         cpu_func(cpu_prog.cast("int*", view.ptr), size)


-# This takes the CPU path
-if FFI:
-    try:
-        # Create input array on CPU
-        arr_cpu = np.zeros(1024, dtype=np.int32)
-        print(f"before: {arr_cpu[:10]=}")
-
-        # Run the workload
-        my_func(arr_cpu, None)
-
-        # Check the result
-        print(f"after: {arr_cpu[:10]=}")
-        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
-    finally:
-        # clean up temp directory
-        del cpu_func
-        shutil.rmtree(temp_dir)
-
-
 # This takes the GPU path
 if cp:
     s = dev.create_stream()
@@ -182,3 +166,23 @@ def my_func(arr, work_stream):
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
         s.close()
+
+# This takes the CPU path
+if FFI:
+    try:
+        # Create input array on CPU
+        arr_cpu = np.zeros(1024, dtype=np.int32)
+        print(f"before: {arr_cpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_cpu, None)
+
+        # Check the result
+        print(f"after: {arr_cpu[:10]=}")
+        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
+    finally:
+        # to allow FFI module to unload, we delete references to
+        # cpu_func
+        del cpu_func, my_func
+        # clean up temp directory
+        shutil.rmtree(temp_dir)
From 380240568641d8108dd50a2720d843c7eb497e4b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 17:15:11 -0500
Subject: [PATCH 4/4] Consolidate strided_memory_view_cpu/gpu examples into single run functions

---
 ...ory_view.py => strided_memory_view_cpu.py} | 143 ++++++------------
 cuda_core/examples/strided_memory_view_gpu.py | 129 ++++++++++++++++
 2 files changed, 174 insertions(+), 98 deletions(-)
 rename cuda_core/examples/{strided_memory_view.py => strided_memory_view_cpu.py} (61%)
 create mode 100644 cuda_core/examples/strided_memory_view_gpu.py

diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view_cpu.py
similarity index 61%
rename from cuda_core/examples/strided_memory_view.py
rename to cuda_core/examples/strided_memory_view_cpu.py
index f9a0b9a255..0fa8f38e6c 100644
--- a/cuda_core/examples/strided_memory_view.py
+++ b/cuda_core/examples/strided_memory_view_cpu.py
@@ -26,14 +26,8 @@
 except ImportError:
     print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
     FFI = None
-try:
-    import cupy as cp
-except ImportError:
-    print("cupy is not installed, the GPU example will be skipped", file=sys.stderr)
-    cp = None
 import numpy as np

-from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
 from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory

 # ################################################################################
@@ -54,8 +48,43 @@
 func_name = "inplace_plus_arange_N"
 func_sig = f"void {func_name}(int* data, size_t N)"

-# Here is a concrete (very naive!) implementation on CPU:
-if FFI:
+
+# Now we are prepared to run the code from the user's perspective!
+#
+# ################################################################################
+
+
+# Below, as a user we want to perform the said in-place operation on a CPU
+# or GPU, by calling the corresponding function implemented "elsewhere"
+# (in the body of the run function).
+
+
+# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
+# of which are supported by StridedMemoryView).
+@args_viewable_as_strided_memory((0,))
+def my_func(arr):
+    global cpu_func
+    global cpu_prog
+    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
+    # ordering is taken care of, so that arr can be safely accessed on our work
+    # stream (ordered after a data stream on which arr is potentially prepared).
+    view = arr.view(-1)
+    assert isinstance(view, StridedMemoryView)
+    assert len(view.shape) == 1
+    assert view.dtype == np.int32
+    assert not view.is_device_accessible
+
+    size = view.shape[0]
+    # DLPack also supports host arrays. We want to know if the array data is
+    # accessible from the GPU, and dispatch to the right routine accordingly.
+    cpu_func(cpu_prog.cast("int*", view.ptr), size)
+
+
+def run():
+    global cpu_func, cpu_prog, my_func
+    if not FFI:
+        return
+    # Here is a concrete (very naive!) implementation on CPU:
     cpu_code = string.Template(r"""
     extern "C"
     $func_sig {
@@ -76,113 +105,31 @@
         extra_compile_args=["-std=c++11"],
     )
     temp_dir = tempfile.mkdtemp()
+    saved_sys_path = sys.path.copy()
     try:
         cpu_prog.compile(tmpdir=temp_dir)
-    finally:
-        shutil.rmtree(temp_dir)
-    saved_sys_path = sys.path
-    try:
+
         sys.path.append(temp_dir)
         cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
-    finally:
-        sys.path = saved_sys_path
-        shutil.rmtree(temp_dir)
-
-# Here is a concrete (again, very naive!) implementation on GPU:
-if cp:
-    gpu_code = string.Template(r"""
-    extern "C"
-    __global__ $func_sig {
-        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-        const size_t stride_size = gridDim.x * blockDim.x;
-        for (size_t i = tid; i < N; i += stride_size) {
-            data[i] += i;
-        }
-    }
-    """).substitute(func_sig=func_sig)
-
-    # To know the GPU's compute capability, we need to identify which GPU to use.
-    dev = Device(0)
-    dev.set_current()
-    arch = "".join(f"{i}" for i in dev.compute_capability)
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
-
-# Now we are prepared to run the code from the user's perspective!
-#
-# ################################################################################
-
-# Below, as a user we want to perform the said in-place operation on either CPU
-# or GPU, by calling the corresponding function implemented "elsewhere" (done above).
-
-
-# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
-# of which are supported by StridedMemoryView).
-@args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream):
-    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
-    # ordering is taken care of, so that arr can be safely accessed on our work
-    # stream (ordered after a data stream on which arr is potentially prepared).
-    view = arr.view(work_stream.handle if work_stream else -1)
-    assert isinstance(view, StridedMemoryView)
-    assert len(view.shape) == 1
-    assert view.dtype == np.int32
-
-    size = view.shape[0]
-    # DLPack also supports host arrays. We want to know if the array data is
-    # accessible from the GPU, and dispatch to the right routine accordingly.
-    if view.is_device_accessible:
-        block = 256
-        grid = (size + block - 1) // block
-        config = LaunchConfig(grid=grid, block=block)
-        launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
-        # Here we're being conservative and synchronize over our work stream,
-        # assuming we do not know the data stream; if we know then we could
-        # just order the data stream after the work stream here, e.g.
-        #
-        #     data_stream.wait(work_stream)
-        #
-        # without an expensive synchronization (with respect to the host).
-        work_stream.sync()
-    else:
-        cpu_func(cpu_prog.cast("int*", view.ptr), size)
-
-
-# This takes the GPU path
-if cp:
-    s = dev.create_stream()
-    try:
-        # Create input array on GPU
-        arr_gpu = cp.ones(1024, dtype=cp.int32)
-        print(f"before: {arr_gpu[:10]=}")
-
-        # Run the workload
-        my_func(arr_gpu, s)
-
-        # Check the result
-        print(f"after: {arr_gpu[:10]=}")
-        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
-    finally:
-        s.close()
-
-# This takes the CPU path
-if FFI:
-    try:
         # Create input array on CPU
         arr_cpu = np.zeros(1024, dtype=np.int32)
         print(f"before: {arr_cpu[:10]=}")

         # Run the workload
-        my_func(arr_cpu, None)
+        my_func(arr_cpu)

         # Check the result
         print(f"after: {arr_cpu[:10]=}")
         assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
     finally:
+        sys.path = saved_sys_path
         # to allow FFI module to unload, we delete references to
         # cpu_func
         del cpu_func, my_func
         # clean up temp directory
         shutil.rmtree(temp_dir)
+
+
+if __name__ == "__main__":
+    run()

diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py
new file mode 100644
index 0000000000..10d12fd30c
--- /dev/null
+++ b/cuda_core/examples/strided_memory_view_gpu.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo aims to illustrate two takeaways:
+#
+#   1. The similarity between CPU and GPU JIT-compilation with C++ sources
+#   2. How to use StridedMemoryView to interface with foreign C/C++ functions
+#
+# To facilitate this demo, we use CuPy (https://cupy.dev/) as the GPU array
+# container, which can be easily installed from pip or conda following its
+# instructions. We also use NumPy for host-side dtypes.
+#
+# ################################################################################
+
+import string
+import sys
+
+try:
+    import cupy as cp
+except ImportError:
+    print("cupy is not installed, the GPU example will be skipped", file=sys.stderr)
+    cp = None
+import numpy as np
+
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
+
+# ################################################################################
+#
+# Usually this entire code block is in a separate file, built as a Python extension
+# module that can be imported by users at run time. For illustrative purposes we
+# use JIT compilation to make this demo self-contained.
+#
+# Here we assume an in-place operation, equivalent to the following NumPy code:
+#
+#   >>> arr = ...
+#   >>> assert arr.dtype == np.int32
+#   >>> assert arr.ndim == 1
+#   >>> arr += np.arange(arr.size, dtype=arr.dtype)
+#
+# is implemented for both CPU and GPU at low-level, with the following C function
+# signature:
+func_name = "inplace_plus_arange_N"
+func_sig = f"void {func_name}(int* data, size_t N)"
+
+# Now we are prepared to run the code from the user's perspective!
+#
+# ################################################################################
+
+
+# Below, as a user we want to perform the said in-place operation on either CPU
+# or GPU, by calling the corresponding function implemented "elsewhere" (done above).
+
+
+# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
+# of which are supported by StridedMemoryView).
+@args_viewable_as_strided_memory((0,))
+def my_func(arr, work_stream, gpu_ker):
+    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
+    # ordering is taken care of, so that arr can be safely accessed on our work
+    # stream (ordered after a data stream on which arr is potentially prepared).
+    view = arr.view(work_stream.handle if work_stream else -1)
+    assert isinstance(view, StridedMemoryView)
+    assert len(view.shape) == 1
+    assert view.dtype == np.int32
+    assert view.is_device_accessible
+
+    size = view.shape[0]
+    # DLPack also supports host arrays. We want to know if the array data is
+    # accessible from the GPU, and dispatch to the right routine accordingly.
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
+    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    # Here we're being conservative and synchronize over our work stream,
+    # assuming we do not know the data stream; if we know then we could
+    # just order the data stream after the work stream here, e.g.
+    #
+    #     data_stream.wait(work_stream)
+    #
+    # without an expensive synchronization (with respect to the host).
+    work_stream.sync()
+
+
+def run():
+    global my_func
+    if not cp:
+        return None
+    # Here is a concrete (very naive!) implementation on GPU:
+    gpu_code = string.Template(r"""
+    extern "C"
+    __global__ $func_sig {
+        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+        const size_t stride_size = gridDim.x * blockDim.x;
+        for (size_t i = tid; i < N; i += stride_size) {
+            data[i] += i;
+        }
+    }
+    """).substitute(func_sig=func_sig)
+
+    # To know the GPU's compute capability, we need to identify which GPU to use.
+    dev = Device(0)
+    dev.set_current()
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
+    mod = gpu_prog.compile(target_type="cubin")
+    gpu_ker = mod.get_kernel(func_name)
+
+    s = dev.create_stream()
+    try:
+        # Create input array on GPU
+        arr_gpu = cp.ones(1024, dtype=cp.int32)
+        print(f"before: {arr_gpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_gpu, s, gpu_ker)
+
+        # Check the result
+        print(f"after: {arr_gpu[:10]=}")
+        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
+    finally:
+        s.close()
+
+
+if __name__ == "__main__":
+    run()