From 3026bdef9a31691cc1a2180a5bbfa61d09c74eea Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 12:25:07 -0500
Subject: [PATCH 1/4] Remove clean_up_cffi_files fixture

---
 cuda_core/tests/conftest.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 5bd5b52b73..4d615d73da 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -1,9 +1,7 @@
 # Copyright 2024 NVIDIA Corporation. All rights reserved.
 # SPDX-License-Identifier: Apache-2.0

-import glob
 import os
-import sys

 try:
     from cuda.bindings import driver
@@ -67,21 +65,6 @@ def pop_all_contexts():
     return pop_all_contexts


-# samples relying on cffi could fail as the modules cannot be imported
-sys.path.append(os.getcwd())
-
-
-@pytest.fixture(scope="session", autouse=True)
-def clean_up_cffi_files():
-    yield
-    files = glob.glob(os.path.join(os.getcwd(), "_cpu_obj*"))
-    for f in files:
-        try:  # noqa: SIM105
-            os.remove(f)
-        except FileNotFoundError:
-            pass  # noqa: SIM105
-
-
 skipif_testing_with_compute_sanitizer = pytest.mark.skipif(
     os.environ.get("CUDA_PYTHON_TESTING_WITH_COMPUTE_SANITIZER", "0") == "1",
     reason="The compute-sanitizer is running, and this test causes an API error.",

From 9303966970ce70752a20577d490af1bd33417eb1 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 12:25:40 -0500
Subject: [PATCH 2/4] Clean up FFI resources in examples/strided_memory_view.py

---
 cuda_core/examples/strided_memory_view.py | 58 +++++++++++++++--------
 1 file changed, 37 insertions(+), 21 deletions(-)

diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py
index a145f7a4e3..c0f11d6eb7 100644
--- a/cuda_core/examples/strided_memory_view.py
+++ b/cuda_core/examples/strided_memory_view.py
@@ -16,8 +16,10 @@
 # ################################################################################

 import importlib
+import shutil
 import string
 import sys
+import tempfile

 try:
     from cffi import FFI
@@ -73,8 +75,15 @@
         source_extension=".cpp",
         extra_compile_args=["-std=c++11"],
     )
-    cpu_prog.compile()
-    cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
+    temp_dir = tempfile.mkdtemp()
+    cpu_prog.compile(tmpdir=temp_dir)
+    saved_sys_path = sys.path
+    try:
+        sys.path.append(temp_dir)
+        cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
+    finally:
+        sys.path = saved_sys_path
+        shutil.rmtree(temp_dir)

 # Here is a concrete (again, very naive!) implementation on GPU:
 if cp:
@@ -140,29 +149,36 @@ def my_func(arr, work_stream):

 # This takes the CPU path
 if FFI:
-    # Create input array on CPU
-    arr_cpu = np.zeros(1024, dtype=np.int32)
-    print(f"before: {arr_cpu[:10]=}")
+    try:
+        # Create input array on CPU
+        arr_cpu = np.zeros(1024, dtype=np.int32)
+        print(f"before: {arr_cpu[:10]=}")

-    # Run the workload
-    my_func(arr_cpu, None)
+        # Run the workload
+        my_func(arr_cpu, None)

-    # Check the result
-    print(f"after: {arr_cpu[:10]=}")
-    assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
+        # Check the result
+        print(f"after: {arr_cpu[:10]=}")
+        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
+    finally:
+        # clean up temp directory
+        del cpu_func
+        shutil.rmtree(temp_dir)

 # This takes the GPU path
 if cp:
     s = dev.create_stream()
-    # Create input array on GPU
-    arr_gpu = cp.ones(1024, dtype=cp.int32)
-    print(f"before: {arr_gpu[:10]=}")
-
-    # Run the workload
-    my_func(arr_gpu, s)
-
-    # Check the result
-    print(f"after: {arr_gpu[:10]=}")
-    assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
-    s.close()
+    try:
+        # Create input array on GPU
+        arr_gpu = cp.ones(1024, dtype=cp.int32)
+        print(f"before: {arr_gpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_gpu, s)
+
+        # Check the result
+        print(f"after: {arr_gpu[:10]=}")
+        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
+    finally:
+        s.close()

From 5775dda65098ef3ef1243cb9b09f8df12dbe0014 Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 13:22:34 -0500
Subject: [PATCH 3/4] Ensure temporary is deleted if compile fails

Make sure to del cpu_func, and my_func which references it.
---
 cuda_core/examples/strided_memory_view.py | 44 ++++++++++++-----------
 1 file changed, 24 insertions(+), 20 deletions(-)

diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view.py
index c0f11d6eb7..f9a0b9a255 100644
--- a/cuda_core/examples/strided_memory_view.py
+++ b/cuda_core/examples/strided_memory_view.py
@@ -76,7 +76,10 @@ extra_compile_args=["-std=c++11"],
     )
     temp_dir = tempfile.mkdtemp()
-    cpu_prog.compile(tmpdir=temp_dir)
+    try:
+        cpu_prog.compile(tmpdir=temp_dir)
+    finally:
+        shutil.rmtree(temp_dir)
     saved_sys_path = sys.path
     try:
         sys.path.append(temp_dir)
         cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
@@ -147,25 +150,6 @@ def my_func(arr, work_stream):
         cpu_func(cpu_prog.cast("int*", view.ptr), size)


-# This takes the CPU path
-if FFI:
-    try:
-        # Create input array on CPU
-        arr_cpu = np.zeros(1024, dtype=np.int32)
-        print(f"before: {arr_cpu[:10]=}")
-
-        # Run the workload
-        my_func(arr_cpu, None)
-
-        # Check the result
-        print(f"after: {arr_cpu[:10]=}")
-        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
-    finally:
-        # clean up temp directory
-        del cpu_func
-        shutil.rmtree(temp_dir)
-
-
 # This takes the GPU path
 if cp:
     s = dev.create_stream()
@@ -182,3 +166,23 @@ def my_func(arr, work_stream):
         assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
     finally:
         s.close()
+
+# This takes the CPU path
+if FFI:
+    try:
+        # Create input array on CPU
+        arr_cpu = np.zeros(1024, dtype=np.int32)
+        print(f"before: {arr_cpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_cpu, None)
+
+        # Check the result
+        print(f"after: {arr_cpu[:10]=}")
+        assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
+    finally:
+        # to allow FFI module to unload, we delete references to
+        # cpu_func
+        del cpu_func, my_func
+        # clean up temp directory
+        shutil.rmtree(temp_dir)
From 380240568641d8108dd50a2720d843c7eb497e4b Mon Sep 17 00:00:00 2001
From: Oleksandr Pavlyk <21087696+oleksandr-pavlyk@users.noreply.github.com>
Date: Wed, 4 Jun 2025 17:15:11 -0500
Subject: [PATCH 4/4] Consolidate strided_memory_view_cpu/gpu examples into single run functions

---
 ...ory_view.py => strided_memory_view_cpu.py} | 143 ++++++------------
 cuda_core/examples/strided_memory_view_gpu.py | 129 ++++++++++++++++
 2 files changed, 174 insertions(+), 98 deletions(-)
 rename cuda_core/examples/{strided_memory_view.py => strided_memory_view_cpu.py} (61%)
 create mode 100644 cuda_core/examples/strided_memory_view_gpu.py

diff --git a/cuda_core/examples/strided_memory_view.py b/cuda_core/examples/strided_memory_view_cpu.py
similarity index 61%
rename from cuda_core/examples/strided_memory_view.py
rename to cuda_core/examples/strided_memory_view_cpu.py
index f9a0b9a255..0fa8f38e6c 100644
--- a/cuda_core/examples/strided_memory_view.py
+++ b/cuda_core/examples/strided_memory_view_cpu.py
@@ -26,14 +26,8 @@
 except ImportError:
     print("cffi is not installed, the CPU example will be skipped", file=sys.stderr)
     FFI = None
-try:
-    import cupy as cp
-except ImportError:
-    print("cupy is not installed, the GPU example will be skipped", file=sys.stderr)
-    cp = None
 import numpy as np

-from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
 from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory

 # ################################################################################
@@ -54,8 +48,43 @@
 func_name = "inplace_plus_arange_N"
 func_sig = f"void {func_name}(int* data, size_t N)"

-# Here is a concrete (very naive!) implementation on CPU:
-if FFI:
+
+# Now we are prepared to run the code from the user's perspective!
+#
+# ################################################################################
+
+
+# Below, as a user we want to perform the said in-place operation on a CPU
+# or GPU, by calling the corresponding function implemented "elsewhere"
+# (in the body of the run function).
+
+
+# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
+# of which are supported by StridedMemoryView).
+@args_viewable_as_strided_memory((0,))
+def my_func(arr):
+    global cpu_func
+    global cpu_prog
+    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
+    # ordering is taken care of, so that arr can be safely accessed on our work
+    # stream (ordered after a data stream on which arr is potentially prepared).
+    view = arr.view(-1)
+    assert isinstance(view, StridedMemoryView)
+    assert len(view.shape) == 1
+    assert view.dtype == np.int32
+    assert not view.is_device_accessible
+
+    size = view.shape[0]
+    # DLPack also supports host arrays. We want to know if the array data is
+    # accessible from the GPU, and dispatch to the right routine accordingly.
+    cpu_func(cpu_prog.cast("int*", view.ptr), size)
+
+
+def run():
+    global cpu_func, cpu_prog, my_func
+    if not FFI:
+        return
+    # Here is a concrete (very naive!) implementation on CPU:
     cpu_code = string.Template(r"""
     extern "C"
     $func_sig {
@@ -76,113 +105,31 @@
         extra_compile_args=["-std=c++11"],
     )
     temp_dir = tempfile.mkdtemp()
+    saved_sys_path = sys.path.copy()
     try:
         cpu_prog.compile(tmpdir=temp_dir)
-    finally:
-        shutil.rmtree(temp_dir)
-    saved_sys_path = sys.path
-    try:
+
         sys.path.append(temp_dir)
         cpu_func = getattr(importlib.import_module("_cpu_obj.lib"), func_name)
-    finally:
-        sys.path = saved_sys_path
-        shutil.rmtree(temp_dir)
-
-# Here is a concrete (again, very naive!) implementation on GPU:
-if cp:
-    gpu_code = string.Template(r"""
-    extern "C"
-    __global__ $func_sig {
-        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-        const size_t stride_size = gridDim.x * blockDim.x;
-        for (size_t i = tid; i < N; i += stride_size) {
-            data[i] += i;
-        }
-    }
-    """).substitute(func_sig=func_sig)
-
-    # To know the GPU's compute capability, we need to identify which GPU to use.
-    dev = Device(0)
-    dev.set_current()
-    arch = "".join(f"{i}" for i in dev.compute_capability)
-    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
-    mod = gpu_prog.compile(target_type="cubin")
-    gpu_ker = mod.get_kernel(func_name)
-
-# Now we are prepared to run the code from the user's perspective!
-#
-# ################################################################################
-
-# Below, as a user we want to perform the said in-place operation on either CPU
-# or GPU, by calling the corresponding function implemented "elsewhere" (done above).
-
-
-# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
-# of which are supported by StridedMemoryView).
-@args_viewable_as_strided_memory((0,))
-def my_func(arr, work_stream):
-    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
-    # ordering is taken care of, so that arr can be safely accessed on our work
-    # stream (ordered after a data stream on which arr is potentially prepared).
-    view = arr.view(work_stream.handle if work_stream else -1)
-    assert isinstance(view, StridedMemoryView)
-    assert len(view.shape) == 1
-    assert view.dtype == np.int32
-
-    size = view.shape[0]
-    # DLPack also supports host arrays. We want to know if the array data is
-    # accessible from the GPU, and dispatch to the right routine accordingly.
-    if view.is_device_accessible:
-        block = 256
-        grid = (size + block - 1) // block
-        config = LaunchConfig(grid=grid, block=block)
-        launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
-        # Here we're being conservative and synchronize over our work stream,
-        # assuming we do not know the data stream; if we know then we could
-        # just order the data stream after the work stream here, e.g.
-        #
-        #     data_stream.wait(work_stream)
-        #
-        # without an expensive synchronization (with respect to the host).
-        work_stream.sync()
-    else:
-        cpu_func(cpu_prog.cast("int*", view.ptr), size)
-
-
-# This takes the GPU path
-if cp:
-    s = dev.create_stream()
-    try:
-        # Create input array on GPU
-        arr_gpu = cp.ones(1024, dtype=cp.int32)
-        print(f"before: {arr_gpu[:10]=}")
-
-        # Run the workload
-        my_func(arr_gpu, s)
-
-        # Check the result
-        print(f"after: {arr_gpu[:10]=}")
-        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
-    finally:
-        s.close()
-
-# This takes the CPU path
-if FFI:
-    try:
         # Create input array on CPU
         arr_cpu = np.zeros(1024, dtype=np.int32)
         print(f"before: {arr_cpu[:10]=}")

         # Run the workload
-        my_func(arr_cpu, None)
+        my_func(arr_cpu)

         # Check the result
         print(f"after: {arr_cpu[:10]=}")
         assert np.allclose(arr_cpu, np.arange(1024, dtype=np.int32))
     finally:
+        sys.path = saved_sys_path
         # to allow FFI module to unload, we delete references to
         # cpu_func
         del cpu_func, my_func
         # clean up temp directory
         shutil.rmtree(temp_dir)
+
+
+if __name__ == "__main__":
+    run()

diff --git a/cuda_core/examples/strided_memory_view_gpu.py b/cuda_core/examples/strided_memory_view_gpu.py
new file mode 100644
index 0000000000..10d12fd30c
--- /dev/null
+++ b/cuda_core/examples/strided_memory_view_gpu.py
@@ -0,0 +1,129 @@
+# Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+# ################################################################################
+#
+# This demo aims to illustrate two takeaways:
+#
+#   1. The similarity between CPU and GPU JIT-compilation with C++ sources
+#   2. How to use StridedMemoryView to interface with foreign C/C++ functions
+#
+# To facilitate this demo, we use CuPy (https://cupy.dev/) as the GPU array
+# container, which can be easily installed from pip or conda following its
+# instructions. We also use NumPy for host-side dtypes.
+#
+# ################################################################################
+
+import string
+import sys
+
+try:
+    import cupy as cp
+except ImportError:
+    print("cupy is not installed, the GPU example will be skipped", file=sys.stderr)
+    cp = None
+import numpy as np
+
+from cuda.core.experimental import Device, LaunchConfig, Program, ProgramOptions, launch
+from cuda.core.experimental.utils import StridedMemoryView, args_viewable_as_strided_memory
+
+# ################################################################################
+#
+# Usually this entire code block is in a separate file, built as a Python extension
+# module that can be imported by users at run time. For illustrative purposes we
+# use JIT compilation to make this demo self-contained.
+#
+# Here we assume an in-place operation, equivalent to the following NumPy code:
+#
+#   >>> arr = ...
+#   >>> assert arr.dtype == np.int32
+#   >>> assert arr.ndim == 1
+#   >>> arr += np.arange(arr.size, dtype=arr.dtype)
+#
+# is implemented for both CPU and GPU at low-level, with the following C function
+# signature:
+func_name = "inplace_plus_arange_N"
+func_sig = f"void {func_name}(int* data, size_t N)"
+
+# Now we are prepared to run the code from the user's perspective!
+#
+# ################################################################################
+
+
+# Below, as a user we want to perform the said in-place operation on either CPU
+# or GPU, by calling the corresponding function implemented "elsewhere" (done above).
+
+
+# We assume the 0-th argument supports either DLPack or CUDA Array Interface (both
+# of which are supported by StridedMemoryView).
+@args_viewable_as_strided_memory((0,))
+def my_func(arr, work_stream, gpu_ker):
+    # Create a memory view over arr (assumed to be a 1D array of int32). The stream
+    # ordering is taken care of, so that arr can be safely accessed on our work
+    # stream (ordered after a data stream on which arr is potentially prepared).
+    view = arr.view(work_stream.handle if work_stream else -1)
+    assert isinstance(view, StridedMemoryView)
+    assert len(view.shape) == 1
+    assert view.dtype == np.int32
+    assert view.is_device_accessible
+
+    size = view.shape[0]
+    # DLPack also supports host arrays. We want to know if the array data is
+    # accessible from the GPU, and dispatch to the right routine accordingly.
+    block = 256
+    grid = (size + block - 1) // block
+    config = LaunchConfig(grid=grid, block=block)
+    launch(work_stream, config, gpu_ker, view.ptr, np.uint64(size))
+    # Here we're being conservative and synchronize over our work stream,
+    # assuming we do not know the data stream; if we know then we could
+    # just order the data stream after the work stream here, e.g.
+    #
+    #     data_stream.wait(work_stream)
+    #
+    # without an expensive synchronization (with respect to the host).
+    work_stream.sync()
+
+
+def run():
+    global my_func
+    if not cp:
+        return None
+    # Here is a concrete (very naive!) implementation on GPU:
+    gpu_code = string.Template(r"""
+    extern "C"
+    __global__ $func_sig {
+        const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
+        const size_t stride_size = gridDim.x * blockDim.x;
+        for (size_t i = tid; i < N; i += stride_size) {
+            data[i] += i;
+        }
+    }
+    """).substitute(func_sig=func_sig)
+
+    # To know the GPU's compute capability, we need to identify which GPU to use.
+    dev = Device(0)
+    dev.set_current()
+    arch = "".join(f"{i}" for i in dev.compute_capability)
+    gpu_prog = Program(gpu_code, code_type="c++", options=ProgramOptions(arch=f"sm_{arch}", std="c++11"))
+    mod = gpu_prog.compile(target_type="cubin")
+    gpu_ker = mod.get_kernel(func_name)
+
+    s = dev.create_stream()
+    try:
+        # Create input array on GPU
+        arr_gpu = cp.ones(1024, dtype=cp.int32)
+        print(f"before: {arr_gpu[:10]=}")
+
+        # Run the workload
+        my_func(arr_gpu, s, gpu_ker)
+
+        # Check the result
+        print(f"after: {arr_gpu[:10]=}")
+        assert cp.allclose(arr_gpu, 1 + cp.arange(1024, dtype=cp.int32))
+    finally:
+        s.close()
+
+
+if __name__ == "__main__":
+    run()