diff --git a/cuda_bindings/cuda/bindings/driver.pyx.in b/cuda_bindings/cuda/bindings/driver.pyx.in index 70f83f4c93..1c2824a0da 100644 --- a/cuda_bindings/cuda/bindings/driver.pyx.in +++ b/cuda_bindings/cuda/bindings/driver.pyx.in @@ -44,6 +44,8 @@ ctypedef unsigned long long float_ptr ctypedef unsigned long long double_ptr ctypedef unsigned long long void_ptr +cdef dict _cu_mem_alloc_managed_concurrent_access_by_device = {} + #: CUDA API version number CUDA_VERSION = cydriver.CUDA_VERSION @@ -31341,6 +31343,39 @@ def cuMemAllocManaged(size_t bytesize, unsigned int flags): -------- :py:obj:`~.cuArray3DCreate`, :py:obj:`~.cuArray3DGetDescriptor`, :py:obj:`~.cuArrayCreate`, :py:obj:`~.cuArrayDestroy`, :py:obj:`~.cuArrayGetDescriptor`, :py:obj:`~.cuMemAllocHost`, :py:obj:`~.cuMemAllocPitch`, :py:obj:`~.cuMemcpy2D`, :py:obj:`~.cuMemcpy2DAsync`, :py:obj:`~.cuMemcpy2DUnaligned`, :py:obj:`~.cuMemcpy3D`, :py:obj:`~.cuMemcpy3DAsync`, :py:obj:`~.cuMemcpyAtoA`, :py:obj:`~.cuMemcpyAtoD`, :py:obj:`~.cuMemcpyAtoH`, :py:obj:`~.cuMemcpyAtoHAsync`, :py:obj:`~.cuMemcpyDtoA`, :py:obj:`~.cuMemcpyDtoD`, :py:obj:`~.cuMemcpyDtoDAsync`, :py:obj:`~.cuMemcpyDtoH`, :py:obj:`~.cuMemcpyDtoHAsync`, :py:obj:`~.cuMemcpyHtoA`, :py:obj:`~.cuMemcpyHtoAAsync`, :py:obj:`~.cuMemcpyHtoD`, :py:obj:`~.cuMemcpyHtoDAsync`, :py:obj:`~.cuMemFree`, :py:obj:`~.cuMemFreeHost`, :py:obj:`~.cuMemGetAddressRange`, :py:obj:`~.cuMemGetInfo`, :py:obj:`~.cuMemHostAlloc`, :py:obj:`~.cuMemHostGetDevicePointer`, :py:obj:`~.cuMemsetD2D8`, :py:obj:`~.cuMemsetD2D16`, :py:obj:`~.cuMemsetD2D32`, :py:obj:`~.cuMemsetD8`, :py:obj:`~.cuMemsetD16`, :py:obj:`~.cuMemsetD32`, :py:obj:`~.cuDeviceGetAttribute`, :py:obj:`~.cuStreamAttachMemAsync`, :py:obj:`~.cudaMallocManaged` """ + # WIP-WIP-WIP THIS CODE NEEDS TO BE PORTED TO THE CODE GENERATOR + cdef int concurrent_access = 0 + cdef int device_id = 0 + cdef cydriver.CUdevice device + err = cydriver.cuCtxGetDevice(&device) + if err != cydriver.CUDA_SUCCESS: + # cuMemAllocManaged would 
fail with the same error anyway. + return (_CUresult(err), None) + device_id = device + if device_id in _cu_mem_alloc_managed_concurrent_access_by_device: + if _cu_mem_alloc_managed_concurrent_access_by_device[device_id] == 0: + raise RuntimeError( + "cuMemAllocManaged is not supported when " + "CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS == 0" + ) + else: + err = cydriver.cuDeviceGetAttribute( + &concurrent_access, + cydriver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS, + device, + ) + if err != cydriver.CUDA_SUCCESS: + raise RuntimeError( + "cuDeviceGetAttribute(CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS) failed " + f"while validating cuMemAllocManaged: {_CUresult(err)}" + ) + _cu_mem_alloc_managed_concurrent_access_by_device[device_id] = concurrent_access + if concurrent_access == 0: + raise RuntimeError( + "cuMemAllocManaged is not supported when " + "CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS == 0" + ) + cdef CUdeviceptr dptr = CUdeviceptr() with nogil: err = cydriver.cuMemAllocManaged(dptr._pvt_ptr, bytesize, flags) diff --git a/cuda_bindings/tests/test_cuda.py b/cuda_bindings/tests/test_cuda.py index e3eefb1fdd..edde1e03fd 100644 --- a/cuda_bindings/tests/test_cuda.py +++ b/cuda_bindings/tests/test_cuda.py @@ -8,6 +8,7 @@ import numpy as np import pytest +from cuda_python_test_helpers.managed_memory import skip_if_concurrent_managed_access_disabled import cuda.bindings.driver as cuda import cuda.bindings.runtime as cudart @@ -325,6 +326,7 @@ def test_cuda_memPool_attr(): driverVersionLessThan(11030) or not supportsManagedMemory(), reason="When new attributes were introduced" ) def test_cuda_pointer_attr(): + skip_if_concurrent_managed_access_disabled() err, ptr = cuda.cuMemAllocManaged(0x1000, cuda.CUmemAttach_flags.CU_MEM_ATTACH_GLOBAL.value) assert err == cuda.CUresult.CUDA_SUCCESS @@ -390,6 +392,7 @@ def test_pointer_get_attributes_device_ordinal(): @pytest.mark.skipif(not supportsManagedMemory(), reason="When new attributes 
were introduced") def test_cuda_mem_range_attr(device): + skip_if_concurrent_managed_access_disabled() size = 0x1000 location_device = cuda.CUmemLocation() location_device.type = cuda.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py index 71d2f30573..07faf282fa 100644 --- a/cuda_core/tests/conftest.py +++ b/cuda_core/tests/conftest.py @@ -51,7 +51,14 @@ def skip_if_pinned_memory_unsupported(device): pytest.skip("PinnedMemoryResource requires CUDA 13.0 or later") +def _skip_if_concurrent_managed_access_disabled(device=None) -> None: + from cuda_python_test_helpers.managed_memory import skip_if_concurrent_managed_access_disabled + + skip_if_concurrent_managed_access_disabled(device) + + def skip_if_managed_memory_unsupported(device): + _skip_if_concurrent_managed_access_disabled(device) try: if not device.properties.memory_pools_supported or not device.properties.concurrent_managed_access: pytest.skip("Device does not support managed memory pool operations") @@ -74,6 +81,11 @@ def create_managed_memory_resource_or_skip(*args, **kwargs): raise +@pytest.fixture +def requires_concurrent_managed_access(): + _skip_if_concurrent_managed_access_disabled() + + @pytest.fixture(scope="session", autouse=True) def session_setup(): # Always init CUDA. 
diff --git a/cuda_core/tests/graph/test_capture_alloc.py b/cuda_core/tests/graph/test_capture_alloc.py index 5cb23fd022..c83e7abb1e 100644 --- a/cuda_core/tests/graph/test_capture_alloc.py +++ b/cuda_core/tests/graph/test_capture_alloc.py @@ -78,6 +78,7 @@ def free(self, buffers): self.stream.sync() +@pytest.mark.usefixtures("requires_concurrent_managed_access") @pytest.mark.parametrize("mode", ["no_graph", "global", "thread_local", "relaxed"]) @pytest.mark.parametrize("action", ["incr", "fill"]) def test_graph_alloc(mempool_device, mode, action): @@ -146,6 +147,7 @@ def apply_kernels(mr, stream, out): assert compare_buffer_to_constant(out, 6) +@pytest.mark.usefixtures("requires_concurrent_managed_access") @pytest.mark.skipif(IS_WINDOWS or IS_WSL, reason="auto_free_on_launch not supported on Windows") @pytest.mark.parametrize("mode", ["global", "thread_local", "relaxed"]) def test_graph_alloc_with_output(mempool_device, mode): diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py index 793ae0744c..ef21329b02 100644 --- a/cuda_core/tests/memory_ipc/test_event_ipc.py +++ b/cuda_core/tests/memory_ipc/test_event_ipc.py @@ -18,6 +18,8 @@ class TestEventIpc: """Check the basic usage of IPC-enabled events with a latch kernel.""" + pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access") + @pytest.mark.flaky(reruns=2) def test_main(self, ipc_device, ipc_memory_resource): log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING) diff --git a/cuda_core/tests/memory_ipc/test_memory_ipc.py b/cuda_core/tests/memory_ipc/test_memory_ipc.py index 1b320fa6f2..0e09156185 100644 --- a/cuda_core/tests/memory_ipc/test_memory_ipc.py +++ b/cuda_core/tests/memory_ipc/test_memory_ipc.py @@ -13,6 +13,8 @@ NWORKERS = 2 NTASKS = 2 +pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access") + class TestIpcMempool: @pytest.mark.flaky(reruns=2) diff --git a/cuda_core/tests/memory_ipc/test_peer_access.py 
b/cuda_core/tests/memory_ipc/test_peer_access.py index a3f8398670..cca056a78c 100644 --- a/cuda_core/tests/memory_ipc/test_peer_access.py +++ b/cuda_core/tests/memory_ipc/test_peer_access.py @@ -59,6 +59,8 @@ class TestBufferPeerAccessAfterImport: setting peer access on the imported memory resource, and that access can be revoked. """ + pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access") + @pytest.mark.flaky(reruns=2) @pytest.mark.parametrize("grant_access_in_parent", [True, False]) def test_main(self, mempool_device_x2, grant_access_in_parent): diff --git a/cuda_core/tests/memory_ipc/test_send_buffers.py b/cuda_core/tests/memory_ipc/test_send_buffers.py index db107ebff4..ede6dfd7df 100644 --- a/cuda_core/tests/memory_ipc/test_send_buffers.py +++ b/cuda_core/tests/memory_ipc/test_send_buffers.py @@ -15,6 +15,8 @@ NTASKS = 7 POOL_SIZE = 2097152 +pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access") + class TestIpcSendBuffers: @pytest.mark.flaky(reruns=2) diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py index bd6a880fdc..6f1978c60d 100644 --- a/cuda_core/tests/memory_ipc/test_serialize.py +++ b/cuda_core/tests/memory_ipc/test_serialize.py @@ -14,6 +14,8 @@ NBYTES = 64 POOL_SIZE = 2097152 +pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access") + class TestObjectSerializationDirect: """ diff --git a/cuda_core/tests/memory_ipc/test_workerpool.py b/cuda_core/tests/memory_ipc/test_workerpool.py index ed66c95d33..89c137f0c2 100644 --- a/cuda_core/tests/memory_ipc/test_workerpool.py +++ b/cuda_core/tests/memory_ipc/test_workerpool.py @@ -17,6 +17,8 @@ NTASKS = 20 POOL_SIZE = 2097152 +pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access") + class TestIpcWorkerPool: """ diff --git a/cuda_core/tests/test_helpers.py b/cuda_core/tests/test_helpers.py index bd13ed5067..2dbb88be6e 100644 --- a/cuda_core/tests/test_helpers.py +++ 
b/cuda_core/tests/test_helpers.py @@ -16,6 +16,8 @@ ENABLE_LOGGING = False # Set True for test debugging and development NBYTES = 64 +pytestmark = pytest.mark.usefixtures("requires_concurrent_managed_access") + def test_latchkernel(): """Test LatchKernel.""" diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py index 0473d2d183..489cb131c7 100644 --- a/cuda_core/tests/test_memory.py +++ b/cuda_core/tests/test_memory.py @@ -164,13 +164,26 @@ def buffer_initialization(dummy_mr: MemoryResource): buffer.close() -def test_buffer_initialization(): +@pytest.mark.parametrize( + ("mr_factory", "needs_device"), + [ + (DummyDeviceMemoryResource, True), + (DummyHostMemoryResource, False), + (DummyUnifiedMemoryResource, True), + (DummyPinnedMemoryResource, True), + ], + ids=["device", "host", "unified", "pinned"], +) +def test_buffer_initialization(mr_factory, needs_device, request): device = Device() device.set_current() - buffer_initialization(DummyDeviceMemoryResource(device)) - buffer_initialization(DummyHostMemoryResource()) - buffer_initialization(DummyUnifiedMemoryResource(device)) - buffer_initialization(DummyPinnedMemoryResource(device)) + if mr_factory is DummyUnifiedMemoryResource: + request.getfixturevalue("requires_concurrent_managed_access") + mr = mr_factory(device) if needs_device else mr_factory() + buffer_initialization(mr) + + +def test_buffer_initialization_invalid_mr(): with pytest.raises(TypeError): buffer_initialization(MemoryResource()) @@ -198,12 +211,22 @@ def buffer_copy_to(dummy_mr: MemoryResource, device: Device, check=False): src_buffer.close() -def test_buffer_copy_to(): +@pytest.mark.parametrize( + ("mr_factory", "check"), + [ + (DummyDeviceMemoryResource, False), + (DummyUnifiedMemoryResource, False), + (DummyPinnedMemoryResource, True), + ], + ids=["device", "unified", "pinned"], +) +def test_buffer_copy_to(mr_factory, check, request): device = Device() device.set_current() - 
buffer_copy_to(DummyDeviceMemoryResource(device), device) - buffer_copy_to(DummyUnifiedMemoryResource(device), device) - buffer_copy_to(DummyPinnedMemoryResource(device), device, check=True) + if mr_factory is DummyUnifiedMemoryResource: + request.getfixturevalue("requires_concurrent_managed_access") + mr = mr_factory(device) + buffer_copy_to(mr, device, check=check) def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): @@ -229,12 +252,22 @@ def buffer_copy_from(dummy_mr: MemoryResource, device, check=False): src_buffer.close() -def test_buffer_copy_from(): +@pytest.mark.parametrize( + ("mr_factory", "check"), + [ + (DummyDeviceMemoryResource, False), + (DummyUnifiedMemoryResource, False), + (DummyPinnedMemoryResource, True), + ], + ids=["device", "unified", "pinned"], +) +def test_buffer_copy_from(mr_factory, check, request): device = Device() device.set_current() - buffer_copy_from(DummyDeviceMemoryResource(device), device) - buffer_copy_from(DummyUnifiedMemoryResource(device), device) - buffer_copy_from(DummyPinnedMemoryResource(device), device, check=True) + if mr_factory is DummyUnifiedMemoryResource: + request.getfixturevalue("requires_concurrent_managed_access") + mr = mr_factory(device) + buffer_copy_from(mr, device, check=check) def _bytes_repeat(pattern: bytes, size: int) -> bytes: @@ -256,6 +289,7 @@ def fill_env(request): if request.param == "device": mr = DummyDeviceMemoryResource(device) elif request.param == "unified": + request.getfixturevalue("requires_concurrent_managed_access") mr = DummyUnifiedMemoryResource(device) else: mr = DummyPinnedMemoryResource(device) @@ -345,13 +379,23 @@ def buffer_close(dummy_mr: MemoryResource): assert buffer.memory_resource is None -def test_buffer_close(): +@pytest.mark.parametrize( + ("mr_factory", "needs_device"), + [ + (DummyDeviceMemoryResource, True), + (DummyHostMemoryResource, False), + (DummyUnifiedMemoryResource, True), + (DummyPinnedMemoryResource, True), + ], + ids=["device", "host", 
"unified", "pinned"], +) +def test_buffer_close(mr_factory, needs_device, request): device = Device() device.set_current() - buffer_close(DummyDeviceMemoryResource(device)) - buffer_close(DummyHostMemoryResource()) - buffer_close(DummyUnifiedMemoryResource(device)) - buffer_close(DummyPinnedMemoryResource(device)) + if mr_factory is DummyUnifiedMemoryResource: + request.getfixturevalue("requires_concurrent_managed_access") + mr = mr_factory(device) if needs_device else mr_factory() + buffer_close(mr) def test_buffer_external_host(): @@ -447,7 +491,7 @@ def test_buffer_external_pinned_registered(change_device): @pytest.mark.parametrize("change_device", [True, False]) -def test_buffer_external_managed(change_device): +def test_buffer_external_managed(change_device, requires_concurrent_managed_access): n = ccx_system.get_num_devices() if n < 1: pytest.skip("No devices found") @@ -560,9 +604,11 @@ def test_buffer_dunder_dlpack(): (DummyPinnedMemoryResource, (DLDeviceType.kDLCUDAHost, 0)), ], ) -def test_buffer_dunder_dlpack_device_success(DummyMR, expected): +def test_buffer_dunder_dlpack_device_success(DummyMR, expected, request): device = Device() device.set_current() + if DummyMR is DummyUnifiedMemoryResource: + request.getfixturevalue("requires_concurrent_managed_access") dummy_mr = DummyMR() if DummyMR is DummyHostMemoryResource else DummyMR(device) buffer = dummy_mr.allocate(size=1024) assert buffer.__dlpack_device__() == expected diff --git a/cuda_core/tests/test_memory_peer_access.py b/cuda_core/tests/test_memory_peer_access.py index b7d5747b75..8ef26faac9 100644 --- a/cuda_core/tests/test_memory_peer_access.py +++ b/cuda_core/tests/test_memory_peer_access.py @@ -11,6 +11,7 @@ NBYTES = 1024 +@pytest.mark.usefixtures("requires_concurrent_managed_access") def test_peer_access_basic(mempool_device_x2): """Basic tests for dmr.peer_accessible_by.""" dev0, dev1 = mempool_device_x2 @@ -81,6 +82,7 @@ def check(expected): dmr.peer_accessible_by = [num_devices] # device 
ID out of bounds
 
 
+@pytest.mark.usefixtures("requires_concurrent_managed_access")
 def test_peer_access_transitions(mempool_device_x3):
     """Advanced tests for dmr.peer_accessible_by."""
 
diff --git a/cuda_python_test_helpers/cuda_python_test_helpers/managed_memory.py b/cuda_python_test_helpers/cuda_python_test_helpers/managed_memory.py
new file mode 100644
index 0000000000..7d425c5256
--- /dev/null
+++ b/cuda_python_test_helpers/cuda_python_test_helpers/managed_memory.py
@@ -0,0 +1,66 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from functools import cache
+
+import pytest
+
+try:
+    from cuda.bindings import driver  # type: ignore
+except Exception:
+    from cuda import cuda as driver  # type: ignore
+
+
+def _resolve_device_id(device) -> int:
+    if device is None:
+        return 0
+    if hasattr(device, "device_id"):
+        return int(device.device_id)
+    try:
+        return int(device)
+    except Exception:
+        return 0
+
+
+def _cu_init_ok() -> bool:
+    try:
+        (err,) = driver.cuInit(0)
+    except Exception:
+        return False
+    return err == driver.CUresult.CUDA_SUCCESS
+
+
+@cache
+def _get_concurrent_managed_access(device_id: int) -> int | None:
+    if not _cu_init_ok():
+        return None
+    try:
+        attr = driver.CUdevice_attribute.CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS
+    except Exception:
+        return None
+    try:
+        err, value = driver.cuDeviceGetAttribute(attr, device_id)
+    except Exception:
+        return None
+    if err != driver.CUresult.CUDA_SUCCESS:
+        return None
+    return int(value)
+
+
+def managed_memory_skip_reason(device=None) -> str | None:
+    """Return a skip reason when managed memory should be avoided."""
+    device_id = _resolve_device_id(device)
+    value = _get_concurrent_managed_access(device_id)
+    if value is None:
+        return "Unable to query CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS"
+    if value == 0:
+        return "CU_DEVICE_ATTRIBUTE_CONCURRENT_MANAGED_ACCESS == 0"
+    return None
+
+
+def skip_if_concurrent_managed_access_disabled(device=None) -> None:
+    reason = managed_memory_skip_reason(device)
+    if reason:
+        pytest.skip(reason)