From 01c2c0e3141a22c45afd571f24a5ed327be2c017 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Thu, 11 Dec 2025 00:54:26 +0000
Subject: [PATCH 01/14] refactor to collect mempool impl

---
 .../core/experimental/_memory/__init__.pxd    |   0
 .../_memory/_device_memory_resource.pxd       |  14 +-
 .../_memory/_device_memory_resource.pyx       | 410 ++----------------
 .../cuda/core/experimental/_memory/_ipc.pxd   |  18 +-
 .../cuda/core/experimental/_memory/_ipc.pyx   |  33 +-
 .../experimental/_memory/_memory_pool.pxd     |  27 ++
 .../experimental/_memory/_memory_pool.pyx     | 404 +++++++++++++++++
 cuda_core/tests/test_memory.py                |   6 +-
 8 files changed, 491 insertions(+), 421 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_memory/__init__.pxd
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx

diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.pxd b/cuda_core/cuda/core/experimental/_memory/__init__.pxd
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd
index 823a270b27..17ee12e54f 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pxd
@@ -2,20 +2,12 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from cuda.bindings cimport cydriver
-from cuda.core.experimental._memory._buffer cimport MemoryResource
+from cuda.core.experimental._memory._memory_pool cimport _MemPool
 from cuda.core.experimental._memory._ipc cimport IPCDataForMR
 
 
-cdef class DeviceMemoryResource(MemoryResource):
-    cdef:
-        int                   _dev_id
-        cydriver.CUmemoryPool _handle
-        bint                  _mempool_owned
-        IPCDataForMR          _ipc_data
-        object                _attributes
-        object                _peer_accessible_by
-        object                __weakref__
+cdef class DeviceMemoryResource(_MemPool):
+    pass
 
 
 cpdef DMR_mempool_get_access(DeviceMemoryResource, int)
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index ac18079a62..03389dbd6a 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -4,31 +4,24 @@
 
 from __future__ import annotations
 
-from libc.limits cimport ULLONG_MAX
-from libc.stdint cimport uintptr_t
-from libc.stdlib cimport malloc, free
-from libc.string cimport memset
-
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
+from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions
 from cuda.core.experimental._memory cimport _ipc
-from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle, IPCDataForMR
-from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
+from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
     HANDLE_RETURN,
 )
 
 from dataclasses import dataclass
+import multiprocessing
 from typing import Optional, TYPE_CHECKING
 import platform  # no-cython-lint
 import uuid
-import weakref
 
-from cuda.core.experimental._utils.cuda_utils import driver
+from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method
 
 if TYPE_CHECKING:
-    from cuda.core.experimental._memory.buffer import DevicePointerT
     from .._device import Device
 
 __all__ = ['DeviceMemoryResource', 'DeviceMemoryResourceOptions']
@@ -53,92 +46,7 @@ cdef class DeviceMemoryResourceOptions:
     max_size : int = 0
 
 
-cdef class DeviceMemoryResourceAttributes:
-    cdef:
-        object _mr_weakref
-
-    def __init__(self, *args, **kwargs):
-        raise RuntimeError("DeviceMemoryResourceAttributes cannot be instantiated directly. Please use MemoryResource APIs.")
-
-    @classmethod
-    def _init(cls, mr):
-        cdef DeviceMemoryResourceAttributes self = DeviceMemoryResourceAttributes.__new__(cls)
-        self._mr_weakref = mr
-        return self
-
-    def __repr__(self):
-        return f"{self.__class__.__name__}(%s)" % ", ".join(
-            f"{attr}={getattr(self, attr)}" for attr in dir(self)
-                                            if not attr.startswith("_")
-        )
-
-    cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except?-1:
-        cdef DeviceMemoryResource mr = <DeviceMemoryResource>(self._mr_weakref())
-        if mr is None:
-            raise RuntimeError("DeviceMemoryResource is expired")
-        cdef cydriver.CUmemoryPool pool_handle = mr._handle
-        with nogil:
-            HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value))
-        return 0
-
-    @property
-    def reuse_follow_event_dependencies(self):
-        """Allow memory to be reused when there are event dependencies between streams."""
-        cdef int value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, &value)
-        return bool(value)
-
-    @property
-    def reuse_allow_opportunistic(self):
-        """Allow reuse of completed frees without dependencies."""
-        cdef int value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, &value)
-        return bool(value)
-
-    @property
-    def reuse_allow_internal_dependencies(self):
-        """Allow insertion of new stream dependencies for memory reuse."""
-        cdef int value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, &value)
-        return bool(value)
-
-    @property
-    def release_threshold(self):
-        """Amount of reserved memory to hold before OS release."""
-        cdef cydriver.cuuint64_t value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &value)
-        return int(value)
-
-    @property
-    def reserved_mem_current(self):
-        """Current amount of backing memory allocated."""
-        cdef cydriver.cuuint64_t value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, &value)
-        return int(value)
-
-    @property
-    def reserved_mem_high(self):
-        """High watermark of backing memory allocated."""
-        cdef cydriver.cuuint64_t value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, &value)
-        return int(value)
-
-    @property
-    def used_mem_current(self):
-        """Current amount of memory in use."""
-        cdef cydriver.cuuint64_t value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT, &value)
-        return int(value)
-
-    @property
-    def used_mem_high(self):
-        """High watermark of memory in use."""
-        cdef cydriver.cuuint64_t value
-        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH, &value)
-        return int(value)
-
-
-cdef class DeviceMemoryResource(MemoryResource):
+cdef class DeviceMemoryResource(_MemPool):
     """
     A device memory resource managing a stream-ordered memory pool.
 
@@ -217,36 +125,26 @@ cdef class DeviceMemoryResource(MemoryResource):
     associated MMR.
     """
 
-    def __cinit__(self):
-        self._dev_id = cydriver.CU_DEVICE_INVALID
-        self._handle = NULL
-        self._mempool_owned = False
-        self._ipc_data = None
-        self._attributes = None
-        self._peer_accessible_by = ()
-
     def __init__(self, device_id: Device | int, options=None):
         from .._device import Device
         cdef int dev_id = Device(device_id).device_id
-        opts = check_or_create_options(
+        cdef DeviceMemoryResourceOptions opts = check_or_create_options(
             DeviceMemoryResourceOptions, options, "DeviceMemoryResource options",
             keep_none=True
         )
+        cdef _MemPoolOptions opts_base = _MemPoolOptions()
 
-        if opts is None:
-            DMR_init_current(self, dev_id)
-        else:
-            DMR_init_create(self, dev_id, opts)
+        cdef bint ipc_enabled = False
+        if opts:
+            ipc_enabled = opts.ipc_enabled
+            if ipc_enabled and not _ipc.is_supported():
+                raise RuntimeError("IPC is not available on {platform.system()}")
+            opts_base._max_size = opts.max_size
+        opts_base._ipc_enabled = ipc_enabled
+        opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+        opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
 
-    def __dealloc__(self):
-        DMR_close(self)
-
-    def close(self):
-        """
-        Close the device memory resource and destroy the associated memory pool
-        if owned.
-        """
-        DMR_close(self)
+        super().__init__(dev_id, opts_base)
 
     def __reduce__(self):
         return DeviceMemoryResource.from_registry, (self.uuid,)
@@ -261,7 +159,7 @@ cdef class DeviceMemoryResource(MemoryResource):
         RuntimeError
             If no mapped memory resource is found in the registry.
         """
-        return _ipc.DMR_from_registry(uuid)
+        return <DeviceMemoryResource>(_ipc.MP_from_registry(uuid))
 
     def register(self, uuid: uuid.UUID) -> DeviceMemoryResource:  # no-cython-lint
         """
@@ -272,7 +170,7 @@ cdef class DeviceMemoryResource(MemoryResource):
         The registered mapped memory resource. If one was previously registered
         with the given key, it is returned.
         """
-        return _ipc.DMR_register(self, uuid)
+        return <DeviceMemoryResource>(_ipc.MP_register(self, uuid))
 
     @classmethod
     def from_allocation_handle(
@@ -299,7 +197,11 @@ cdef class DeviceMemoryResource(MemoryResource):
         -------
             A new device memory resource instance with the imported handle.
         """
-        return _ipc.DMR_from_allocation_handle(cls, device_id, alloc_handle)
+        cdef DeviceMemoryResource mr = <DeviceMemoryResource>(
+            _ipc.MP_from_allocation_handle(cls, alloc_handle))
+        from .._device import Device
+        mr._dev_id = Device(device_id).device_id
+        return mr
 
     def get_allocation_handle(self) -> IPCAllocationHandle:
         """Export the memory pool handle to be shared (requires IPC).
@@ -315,73 +217,11 @@ cdef class DeviceMemoryResource(MemoryResource):
             raise RuntimeError("Memory resource is not IPC-enabled")
         return self._ipc_data._alloc_handle
 
-    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
-        """Allocate a buffer of the requested size.
-
-        Parameters
-        ----------
-        size : int
-            The size of the buffer to allocate, in bytes.
-        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
-            The stream on which to perform the allocation asynchronously.
-            If None, an internal stream is used.
-
-        Returns
-        -------
-        Buffer
-            The allocated buffer object, which is accessible on the device that this memory
-            resource was created for.
-        """
-        if self.is_mapped:
-            raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
-        stream = Stream_accept(stream) if stream is not None else default_stream()
-        return DMR_allocate(self, size, <Stream> stream)
-
-    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
-        """Deallocate a buffer previously allocated by this resource.
-
-        Parameters
-        ----------
-        ptr : :obj:`~_memory.DevicePointerT`
-            The pointer or handle to the buffer to deallocate.
-        size : int
-            The size of the buffer to deallocate, in bytes.
-        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
-            The stream on which to perform the deallocation asynchronously.
-            If the buffer is deallocated without an explicit stream, the allocation stream
-            is used.
-        """
-        stream = Stream_accept(stream) if stream is not None else default_stream()
-        DMR_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)
-
-    @property
-    def attributes(self) -> DeviceMemoryResourceAttributes:
-        """Memory pool attributes."""
-        if self._attributes is None:
-            ref = weakref.ref(self)
-            self._attributes = DeviceMemoryResourceAttributes._init(ref)
-        return self._attributes
-
-    @property
-    def device_id(self) -> int:
-        """The associated device ordinal."""
-        return self._dev_id
-
-    @property
-    def handle(self) -> driver.CUmemoryPool:
-        """Handle to the underlying memory pool."""
-        return driver.CUmemoryPool(<uintptr_t>(self._handle))
-
     @property
     def is_device_accessible(self) -> bool:
         """Return True. This memory resource provides device-accessible buffers."""
         return True
 
-    @property
-    def is_handle_owned(self) -> bool:
-        """Whether the memory resource handle is owned. If False, ``close`` has no effect."""
-        return self._mempool_owned
-
     @property
     def is_host_accessible(self) -> bool:
         """Return False. This memory resource does not provide host-accessible buffers."""
@@ -408,197 +248,6 @@ cdef class DeviceMemoryResource(MemoryResource):
         """
         return getattr(self._ipc_data, 'uuid', None)
 
-    @property
-    def peer_accessible_by(self):
-        """
-        Get or set the devices that can access allocations from this memory
-        pool. Access can be modified at any time and affects all allocations
-        from this memory pool.
-
-        Returns a tuple of sorted device IDs that currently have peer access to
-        allocations from this memory pool.
-
-        When setting, accepts a sequence of Device objects or device IDs.
-        Setting to an empty sequence revokes all peer access.
-
-        Examples
-        --------
-        >>> dmr = DeviceMemoryResource(0)
-        >>> dmr.peer_accessible_by = [1]  # Grant access to device 1
-        >>> assert dmr.peer_accessible_by == (1,)
-        >>> dmr.peer_accessible_by = []  # Revoke access
-        """
-        return self._peer_accessible_by
-
-    @peer_accessible_by.setter
-    def peer_accessible_by(self, devices):
-        """Set which devices can access this memory pool."""
-        from .._device import Device
-
-        # Convert all devices to device IDs
-        cdef set[int] target_ids = {Device(dev).device_id for dev in devices}
-        target_ids.discard(self._dev_id)  # exclude this device from peer access list
-        this_dev = Device(self._dev_id)
-        cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)]
-        if bad:
-            raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}")
-        cdef set[int] cur_ids = set(self._peer_accessible_by)
-        cdef set[int] to_add = target_ids - cur_ids
-        cdef set[int] to_rm = cur_ids - target_ids
-        cdef size_t count = len(to_add) + len(to_rm) # transaction size
-        cdef cydriver.CUmemAccessDesc* access_desc = NULL
-        cdef size_t i = 0
-
-        if count > 0:
-            access_desc = <cydriver.CUmemAccessDesc*>malloc(count * sizeof(cydriver.CUmemAccessDesc))
-            if access_desc == NULL:
-                raise MemoryError("Failed to allocate memory for access descriptors")
-
-            try:
-                for dev_id in to_add:
-                    access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
-                    access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-                    access_desc[i].location.id = dev_id
-                    i += 1
-
-                for dev_id in to_rm:
-                    access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
-                    access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-                    access_desc[i].location.id = dev_id
-                    i += 1
-
-                with nogil:
-                    HANDLE_RETURN(cydriver.cuMemPoolSetAccess(self._handle, access_desc, count))
-            finally:
-                if access_desc != NULL:
-                    free(access_desc)
-
-            self._peer_accessible_by = tuple(target_ids)
-
-
-# DeviceMemoryResource Implementation
-# -----------------------------------
-
-cdef void DMR_init_current(DeviceMemoryResource self, int dev_id):
-    # Get the current memory pool.
-    cdef cydriver.cuuint64_t current_threshold
-    cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX
-
-    self._dev_id = dev_id
-    self._mempool_owned = False
-
-    with nogil:
-        HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id))
-
-        # Set a higher release threshold to improve performance when there are
-        # no active allocations.  By default, the release threshold is 0, which
-        # means memory is immediately released back to the OS when there are no
-        # active suballocations, causing performance issues.
-        HANDLE_RETURN(
-            cydriver.cuMemPoolGetAttribute(
-                self._handle,
-                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                &current_threshold
-            )
-        )
-
-        # If threshold is 0 (default), set it to maximum to retain memory in the pool.
-        if current_threshold == 0:
-            HANDLE_RETURN(cydriver.cuMemPoolSetAttribute(
-                self._handle,
-                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                &max_threshold
-            ))
-
-
-cdef void DMR_init_create(
-    DeviceMemoryResource self, int dev_id, DeviceMemoryResourceOptions opts
-):
-    # Create a new memory pool.
-    cdef cydriver.CUmemPoolProps properties
-
-    if opts.ipc_enabled and not _ipc.is_supported():
-        raise RuntimeError("IPC is not available on {platform.system()}")
-
-    memset(&properties, 0, sizeof(cydriver.CUmemPoolProps))
-    properties.allocType = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
-    properties.handleTypes = _ipc.IPC_HANDLE_TYPE if opts.ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE
-    properties.location.id = dev_id
-    properties.location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-    properties.maxSize = opts.max_size
-    properties.win32SecurityAttributes = NULL
-    properties.usage = 0
-
-    self._dev_id = dev_id
-    self._mempool_owned = True
-
-    with nogil:
-        HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties))
-        # TODO: should we also set the threshold here?
-
-    if opts.ipc_enabled:
-        alloc_handle = _ipc.DMR_export_mempool(self)
-        self._ipc_data = IPCDataForMR(alloc_handle, False)
-
-
-# Raise an exception if the given stream is capturing.
-# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
-cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil:
-    cdef cydriver.CUstreamCaptureStatus capturing
-    HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
-    if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE:
-        raise RuntimeError("DeviceMemoryResource cannot perform memory operations on "
-                           "a capturing stream (consider using GraphMemoryResource).")
-
-
-cdef inline Buffer DMR_allocate(DeviceMemoryResource self, size_t size, Stream stream):
-    cdef cydriver.CUstream s = stream._handle
-    cdef cydriver.CUdeviceptr devptr
-    with nogil:
-        check_not_capturing(s)
-        HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
-    cdef Buffer buf = Buffer.__new__(Buffer)
-    buf._ptr = <uintptr_t>(devptr)
-    buf._ptr_obj = None
-    buf._size = size
-    buf._memory_resource = self
-    buf._alloc_stream = stream
-    return buf
-
-
-cdef inline void DMR_deallocate(
-    DeviceMemoryResource self, uintptr_t ptr, size_t size, Stream stream
-) noexcept:
-    cdef cydriver.CUstream s = stream._handle
-    cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
-    cdef cydriver.CUresult r
-    with nogil:
-        r = cydriver.cuMemFreeAsync(devptr, s)
-        if r != cydriver.CUDA_ERROR_INVALID_CONTEXT:
-            HANDLE_RETURN(r)
-
-
-cdef inline DMR_close(DeviceMemoryResource self):
-    if self._handle == NULL:
-        return
-
-    # This works around nvbug 5698116. When a memory pool handle is recycled
-    # the new handle inherits the peer access state of the previous handle.
-    if self._peer_accessible_by:
-        self.peer_accessible_by = []
-
-    try:
-        if self._mempool_owned:
-            with nogil:
-                HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle))
-    finally:
-        self._dev_id = cydriver.CU_DEVICE_INVALID
-        self._handle = NULL
-        self._attributes = None
-        self._mempool_owned = False
-        self._ipc_data = None
-        self._peer_accessible_by = ()
-
 
 # Note: this is referenced in instructions to debug nvbug 5698116.
 cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id):
@@ -633,3 +282,14 @@ cpdef DMR_mempool_get_access(DeviceMemoryResource dmr, int device_id):
         return "r"
     else:
         return ""
+
+
+def _deep_reduce_device_memory_resource(mr):
+    check_multiprocessing_start_method()
+    from .._device import Device
+    device = Device(mr.device_id)
+    alloc_handle = mr.get_allocation_handle()
+    return mr.from_allocation_handle, (device, alloc_handle)
+
+
+multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource)
diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd
index 60d96a3b33..3fed2b7188 100644
--- a/cuda_core/cuda/core/experimental/_memory/_ipc.pxd
+++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pxd
@@ -4,10 +4,10 @@
 
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._buffer cimport Buffer
-from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
+from cuda.core.experimental._memory._memory_pool cimport _MemPool
 
 
-# Holds DeviceMemoryResource objects imported by this process.  This enables
+# Holds _MemPool objects imported by this process.  This enables
 # buffer serialization, as buffers can reduce to a pair comprising the memory
 # resource UUID (the key into this registry) and the serialized buffer
 # descriptor.
@@ -53,12 +53,12 @@ cdef class IPCAllocationHandle:
 # Buffer IPC Implementation
 # -------------------------
 cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer)
-cdef Buffer Buffer_from_ipc_descriptor(cls, DeviceMemoryResource, IPCBufferDescriptor, stream)
+cdef Buffer Buffer_from_ipc_descriptor(cls, _MemPool, IPCBufferDescriptor, stream)
 
 
-# DeviceMemoryResource IPC Implementation
-# ---------------------------------------
-cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle)
-cdef DeviceMemoryResource DMR_from_registry(uuid)
-cdef DeviceMemoryResource DMR_register(DeviceMemoryResource, uuid)
-cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource)
+# _MemPool IPC Implementation
+# ---------------------------
+cdef _MemPool MP_from_allocation_handle(cls, alloc_handle)
+cdef _MemPool MP_from_registry(uuid)
+cdef _MemPool MP_register(_MemPool, uuid)
+cdef IPCAllocationHandle MP_export_mempool(_MemPool)
diff --git a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx
index c9931855cf..980e814e11 100644
--- a/cuda_core/cuda/core/experimental/_memory/_ipc.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_ipc.pyx
@@ -142,17 +142,6 @@ def _reconstruct_allocation_handle(cls, df, uuid):  # no-cython-lint
 multiprocessing.reduction.register(IPCAllocationHandle, _reduce_allocation_handle)
 
 
-def _deep_reduce_device_memory_resource(mr):
-    check_multiprocessing_start_method()
-    from .._device import Device
-    device = Device(mr.device_id)
-    alloc_handle = mr.get_allocation_handle()
-    return mr.from_allocation_handle, (device, alloc_handle)
-
-
-multiprocessing.reduction.register(DeviceMemoryResource, _deep_reduce_device_memory_resource)
-
-
 # Buffer IPC Implementation
 # -------------------------
 cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self):
@@ -169,13 +158,13 @@ cdef IPCBufferDescriptor Buffer_get_ipc_descriptor(Buffer self):
     return IPCBufferDescriptor._init(data_b, self.size)
 
 cdef Buffer Buffer_from_ipc_descriptor(
-    cls, DeviceMemoryResource mr, IPCBufferDescriptor ipc_descriptor, stream
+    cls, _MemPool mr, IPCBufferDescriptor ipc_descriptor, stream
 ):
     """Import a buffer that was exported from another process."""
     if not mr.is_ipc_enabled:
         raise RuntimeError("Memory resource is not IPC-enabled")
     if stream is None:
-        # Note: match this behavior to DeviceMemoryResource.allocate()
+        # Note: match this behavior to _MemPool.allocate()
         stream = default_stream()
     cdef cydriver.CUmemPoolPtrExportData data
     memcpy(
@@ -189,10 +178,10 @@ cdef Buffer Buffer_from_ipc_descriptor(
     return Buffer._init(<uintptr_t>ptr, ipc_descriptor.size, mr, stream, ipc_descriptor)
 
 
-# DeviceMemoryResource IPC Implementation
-# ---------------------------------------
+# _MemPool IPC Implementation
+# ---------------------------
 
-cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handle):
+cdef _MemPool MP_from_allocation_handle(cls, alloc_handle):
     # Quick exit for registry hits.
     uuid = getattr(alloc_handle, 'uuid', None)  # no-cython-lint
     mr = registry.get(uuid)
@@ -209,10 +198,8 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl
             os.close(fd)
             raise
 
-    # Construct a new DMR.
-    cdef DeviceMemoryResource self = DeviceMemoryResource.__new__(cls)
-    from .._device import Device
-    self._dev_id = Device(device_id).device_id
+    # Construct a new mempool
+    cdef _MemPool self = <_MemPool>(cls.__new__(cls))
     self._mempool_owned = True
     self._ipc_data = IPCDataForMR(alloc_handle, True)
 
@@ -231,14 +218,14 @@ cdef DeviceMemoryResource DMR_from_allocation_handle(cls, device_id, alloc_handl
     return self
 
 
-cdef DeviceMemoryResource DMR_from_registry(uuid):
+cdef _MemPool MP_from_registry(uuid):
     try:
         return registry[uuid]
     except KeyError:
         raise RuntimeError(f"Memory resource {uuid} was not found") from None
 
 
-cdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid):
+cdef _MemPool MP_register(_MemPool self, uuid):
     existing = registry.get(uuid)
     if existing is not None:
         return existing
@@ -248,7 +235,7 @@ cdef DeviceMemoryResource DMR_register(DeviceMemoryResource self, uuid):
     return self
 
 
-cdef IPCAllocationHandle DMR_export_mempool(DeviceMemoryResource self):
+cdef IPCAllocationHandle MP_export_mempool(_MemPool self):
     # Note: This is Linux only (int for file descriptor)
     cdef int fd
     with nogil:
diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd
new file mode 100644
index 0000000000..eb40d3be12
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd
@@ -0,0 +1,27 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.bindings cimport cydriver
+from cuda.core.experimental._memory._buffer cimport MemoryResource
+from cuda.core.experimental._memory._ipc cimport IPCDataForMR
+
+
+cdef class _MemPool(MemoryResource):
+    cdef:
+        int                   _dev_id
+        cydriver.CUmemoryPool _handle
+        bint                  _mempool_owned
+        IPCDataForMR          _ipc_data
+        object                _attributes
+        object                _peer_accessible_by
+        object                __weakref__
+
+
+cdef class _MemPoolOptions:
+
+    cdef:
+        bint _ipc_enabled
+        size_t _max_size
+        cydriver.CUmemLocationType _location
+        cydriver.CUmemAllocationType _type
diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
new file mode 100644
index 0000000000..c05e3e20a6
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
@@ -0,0 +1,404 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from libc.limits cimport ULLONG_MAX
+from libc.stdint cimport uintptr_t
+from libc.string cimport memset
+from cpython.mem cimport PyMem_Malloc, PyMem_Free
+
+from cuda.bindings cimport cydriver
+from cuda.core.experimental._memory._buffer cimport Buffer, MemoryResource
+from cuda.core.experimental._memory cimport _ipc
+from cuda.core.experimental._stream cimport default_stream, Stream_accept, Stream
+from cuda.core.experimental._utils.cuda_utils cimport (
+    HANDLE_RETURN,
+)
+
+from typing import TYPE_CHECKING
+import platform  # no-cython-lint
+import weakref
+
+from cuda.core.experimental._utils.cuda_utils import driver
+
+if TYPE_CHECKING:
+    from cuda.core.experimental._memory.buffer import DevicePointerT
+    from .._device import Device
+
+
+cdef class _MemPoolOptions:
+
+    def __cinit__(self):
+        self._ipc_enabled = False
+        self._max_size = 0
+        self._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_INVALID
+        self._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_INVALID
+
+
+cdef class _MemPoolAttributes:
+    cdef:
+        object _mr_weakref
+
+    def __init__(self, *args, **kwargs):
+        raise RuntimeError("_MemPoolAttributes cannot be instantiated directly. Please use MemoryResource APIs.")
+
+    @classmethod
+    def _init(cls, mr):
+        cdef _MemPoolAttributes self = _MemPoolAttributes.__new__(cls)
+        self._mr_weakref = mr
+        return self
+
+    def __repr__(self):
+        return f"{self.__class__.__name__}(%s)" % ", ".join(
+            f"{attr}={getattr(self, attr)}" for attr in dir(self)
+                                            if not attr.startswith("_")
+        )
+
+    cdef int _getattribute(self, cydriver.CUmemPool_attribute attr_enum, void* value) except?-1:
+        cdef _MemPool mr = <_MemPool>(self._mr_weakref())
+        if mr is None:
+            raise RuntimeError("_MemPool is expired")
+        cdef cydriver.CUmemoryPool pool_handle = mr._handle
+        with nogil:
+            HANDLE_RETURN(cydriver.cuMemPoolGetAttribute(pool_handle, attr_enum, value))
+        return 0
+
+    @property
+    def reuse_follow_event_dependencies(self):
+        """Allow memory to be reused when there are event dependencies between streams."""
+        cdef int value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_FOLLOW_EVENT_DEPENDENCIES, &value)
+        return bool(value)
+
+    @property
+    def reuse_allow_opportunistic(self):
+        """Allow reuse of completed frees without dependencies."""
+        cdef int value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_OPPORTUNISTIC, &value)
+        return bool(value)
+
+    @property
+    def reuse_allow_internal_dependencies(self):
+        """Allow insertion of new stream dependencies for memory reuse."""
+        cdef int value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_REUSE_ALLOW_INTERNAL_DEPENDENCIES, &value)
+        return bool(value)
+
+    @property
+    def release_threshold(self):
+        """Amount of reserved memory to hold before OS release."""
+        cdef cydriver.cuuint64_t value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD, &value)
+        return int(value)
+
+    @property
+    def reserved_mem_current(self):
+        """Current amount of backing memory allocated."""
+        cdef cydriver.cuuint64_t value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_CURRENT, &value)
+        return int(value)
+
+    @property
+    def reserved_mem_high(self):
+        """High watermark of backing memory allocated."""
+        cdef cydriver.cuuint64_t value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RESERVED_MEM_HIGH, &value)
+        return int(value)
+
+    @property
+    def used_mem_current(self):
+        """Current amount of memory in use."""
+        cdef cydriver.cuuint64_t value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_CURRENT, &value)
+        return int(value)
+
+    @property
+    def used_mem_high(self):
+        """High watermark of memory in use."""
+        cdef cydriver.cuuint64_t value
+        self._getattribute(cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_USED_MEM_HIGH, &value)
+        return int(value)
+
+
+cdef class _MemPool(MemoryResource):
+
+    def __cinit__(self):
+        self._dev_id = cydriver.CU_DEVICE_INVALID
+        self._handle = NULL
+        self._mempool_owned = False
+        self._ipc_data = None
+        self._attributes = None
+        self._peer_accessible_by = ()
+
+    def __init__(self, device_id: Device | int, _MemPoolOptions opts):
+        from .._device import Device
+        cdef int dev_id = Device(device_id).device_id
+
+        if opts is None:
+            _MP_init_current(self, dev_id)
+        else:
+            _MP_init_create(self, dev_id, opts)
+
+    def __dealloc__(self):
+        _MP_close(self)
+
+    def close(self):
+        """
+        Close the device memory resource and destroy the associated memory pool
+        if owned.
+        """
+        _MP_close(self)
+
+    def allocate(self, size_t size, stream: Stream | GraphBuilder | None = None) -> Buffer:
+        """Allocate a buffer of the requested size.
+
+        Parameters
+        ----------
+        size : int
+            The size of the buffer to allocate, in bytes.
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
+            The stream on which to perform the allocation asynchronously.
+            If None, an internal stream is used.
+
+        Returns
+        -------
+        Buffer
+            The allocated buffer object, which is accessible on the device that this memory
+            resource was created for.
+        """
+        if self.is_mapped:
+            raise TypeError("Cannot allocate from a mapped IPC-enabled memory resource")
+        stream = Stream_accept(stream) if stream is not None else default_stream()
+        return _MP_allocate(self, size, <Stream> stream)
+
+    def deallocate(self, ptr: DevicePointerT, size_t size, stream: Stream | GraphBuilder | None = None):
+        """Deallocate a buffer previously allocated by this resource.
+
+        Parameters
+        ----------
+        ptr : :obj:`~_memory.DevicePointerT`
+            The pointer or handle to the buffer to deallocate.
+        size : int
+            The size of the buffer to deallocate, in bytes.
+        stream : :obj:`~_stream.Stream` | :obj:`~_graph.GraphBuilder`, optional
+            The stream on which to perform the deallocation asynchronously.
+            If the buffer is deallocated without an explicit stream, the allocation stream
+            is used.
+        """
+        stream = Stream_accept(stream) if stream is not None else default_stream()
+        _MP_deallocate(self, <uintptr_t>ptr, size, <Stream> stream)
+
+    @property
+    def attributes(self) -> _MemPoolAttributes:
+        """Memory pool attributes."""
+        if self._attributes is None:
+            ref = weakref.ref(self)
+            self._attributes = _MemPoolAttributes._init(ref)
+        return self._attributes
+
+    @property
+    def device_id(self) -> int:
+        """The associated device ordinal."""
+        return self._dev_id
+
+    @property
+    def handle(self) -> driver.CUmemoryPool:
+        """Handle to the underlying memory pool."""
+        return driver.CUmemoryPool(<uintptr_t>(self._handle))
+
+    @property
+    def is_handle_owned(self) -> bool:
+        """Whether the memory resource handle is owned. If False, ``close`` has no effect."""
+        return self._mempool_owned
+
+    @property
+    def peer_accessible_by(self):
+        """
+        Get or set the devices that can access allocations from this memory
+        pool. Access can be modified at any time and affects all allocations
+        from this memory pool.
+
+        Returns a tuple of sorted device IDs that currently have peer access to
+        allocations from this memory pool.
+
+        When setting, accepts a sequence of Device objects or device IDs.
+        Setting to an empty sequence revokes all peer access.
+
+        Examples
+        --------
+        >>> dmr = DeviceMemoryResource(0)
+        >>> dmr.peer_accessible_by = [1]  # Grant access to device 1
+        >>> assert dmr.peer_accessible_by == (1,)
+        >>> dmr.peer_accessible_by = []  # Revoke access
+        """
+        return self._peer_accessible_by
+
+    @peer_accessible_by.setter
+    def peer_accessible_by(self, devices):
+        """Set which devices can access this memory pool."""
+        from .._device import Device
+
+        # Convert all devices to device IDs
+        cdef set[int] target_ids = {Device(dev).device_id for dev in devices}
+        target_ids.discard(self._dev_id)  # exclude this device from peer access list
+        this_dev = Device(self._dev_id)
+        cdef list bad = [dev for dev in target_ids if not this_dev.can_access_peer(dev)]
+        if bad:
+            raise ValueError(f"Device {self._dev_id} cannot access peer(s): {', '.join(map(str, bad))}")
+        cdef set[int] cur_ids = set(self._peer_accessible_by)
+        cdef set[int] to_add = target_ids - cur_ids
+        cdef set[int] to_rm = cur_ids - target_ids
+        cdef size_t count = len(to_add) + len(to_rm) # transaction size
+        cdef cydriver.CUmemAccessDesc* access_desc = NULL
+        cdef size_t i = 0
+
+        if count > 0:
+            access_desc = <cydriver.CUmemAccessDesc*>PyMem_Malloc(count * sizeof(cydriver.CUmemAccessDesc))
+            if access_desc == NULL:
+                raise MemoryError("Failed to allocate memory for access descriptors")
+
+            try:
+                for dev_id in to_add:
+                    access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_READWRITE
+                    access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+                    access_desc[i].location.id = dev_id
+                    i += 1
+
+                for dev_id in to_rm:
+                    access_desc[i].flags = cydriver.CUmemAccess_flags.CU_MEM_ACCESS_FLAGS_PROT_NONE
+                    access_desc[i].location.type = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+                    access_desc[i].location.id = dev_id
+                    i += 1
+
+                with nogil:
+                    HANDLE_RETURN(cydriver.cuMemPoolSetAccess(self._handle, access_desc, count))
+            finally:
+                if access_desc != NULL:
+                    PyMem_Free(access_desc)
+
+            self._peer_accessible_by = tuple(target_ids)
+
+
+# _MemPool Implementation
+# -----------------------
+
+cdef int _MP_init_current(_MemPool self, int dev_id) except?-1:
+    # Get the current memory pool.
+    cdef cydriver.cuuint64_t current_threshold
+    cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX
+
+    self._dev_id = dev_id
+    self._mempool_owned = False
+
+    with nogil:
+        HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id))
+
+        # Set a higher release threshold to improve performance when there are
+        # no active allocations.  By default, the release threshold is 0, which
+        # means memory is immediately released back to the OS when there are no
+        # active suballocations, causing performance issues.
+        HANDLE_RETURN(
+            cydriver.cuMemPoolGetAttribute(
+                self._handle,
+                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                &current_threshold
+            )
+        )
+
+        # If threshold is 0 (default), set it to maximum to retain memory in the pool.
+        if current_threshold == 0:
+            HANDLE_RETURN(cydriver.cuMemPoolSetAttribute(
+                self._handle,
+                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                &max_threshold
+            ))
+
+    return 0
+
+
+cdef void _MP_init_create(
+    _MemPool self, int dev_id, _MemPoolOptions opts
+):
+    cdef cydriver.CUmemPoolProps properties
+    memset(&properties, 0, sizeof(cydriver.CUmemPoolProps))
+
+    cdef bint ipc_enabled = opts._ipc_enabled
+    properties.allocType = opts._type
+    properties.handleTypes = _ipc.IPC_HANDLE_TYPE if ipc_enabled else cydriver.CUmemAllocationHandleType.CU_MEM_HANDLE_TYPE_NONE
+    properties.location.id = dev_id
+    properties.location.type = opts._location
+    # managed memory does not support maxSize as of CUDA 13.0
+    if properties.allocType != cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
+        properties.maxSize = opts._max_size
+
+    self._dev_id = dev_id
+    self._mempool_owned = True
+
+    with nogil:
+        HANDLE_RETURN(cydriver.cuMemPoolCreate(&(self._handle), &properties))
+        # TODO: should we also set the threshold here?
+
+    if ipc_enabled:
+        alloc_handle = _ipc.MP_export_mempool(self)
+        self._ipc_data = _ipc.IPCDataForMR(alloc_handle, False)
+
+
+# Raise an exception if the given stream is capturing.
+# A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
+cdef inline int check_not_capturing(cydriver.CUstream s) except?-1 nogil:
+    cdef cydriver.CUstreamCaptureStatus capturing
+    HANDLE_RETURN(cydriver.cuStreamIsCapturing(s, &capturing))
+    if capturing != cydriver.CUstreamCaptureStatus.CU_STREAM_CAPTURE_STATUS_NONE:
+        raise RuntimeError("_MemPool cannot perform memory operations on "
+                           "a capturing stream (consider using GraphMemoryResource).")
+
+
+cdef inline Buffer _MP_allocate(_MemPool self, size_t size, Stream stream):
+    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUdeviceptr devptr
+    with nogil:
+        check_not_capturing(s)
+        HANDLE_RETURN(cydriver.cuMemAllocFromPoolAsync(&devptr, size, self._handle, s))
+    cdef Buffer buf = Buffer.__new__(Buffer)
+    buf._ptr = <uintptr_t>(devptr)
+    buf._ptr_obj = None
+    buf._size = size
+    buf._memory_resource = self
+    buf._alloc_stream = stream
+    return buf
+
+
+cdef inline void _MP_deallocate(
+    _MemPool self, uintptr_t ptr, size_t size, Stream stream
+) noexcept:
+    cdef cydriver.CUstream s = stream._handle
+    cdef cydriver.CUdeviceptr devptr = <cydriver.CUdeviceptr>ptr
+    cdef cydriver.CUresult r
+    with nogil:
+        r = cydriver.cuMemFreeAsync(devptr, s)
+        if r != cydriver.CUDA_ERROR_INVALID_CONTEXT:
+            HANDLE_RETURN(r)
+
+
+cdef inline _MP_close(_MemPool self):
+    if self._handle == NULL:
+        return
+
+    # This works around nvbug 5698116. When a memory pool handle is recycled
+    # the new handle inherits the peer access state of the previous handle.
+    if self._peer_accessible_by:
+        self.peer_accessible_by = []
+
+    try:
+        if self._mempool_owned:
+            with nogil:
+                HANDLE_RETURN(cydriver.cuMemPoolDestroy(self._handle))
+    finally:
+        self._dev_id = cydriver.CU_DEVICE_INVALID
+        self._handle = NULL
+        self._attributes = None
+        self._mempool_owned = False
+        self._ipc_data = None
+        self._peer_accessible_by = ()
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index be46802493..2dceeb494b 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -880,7 +880,7 @@ def test_mempool_attributes_repr(mempool_device):
     buffer2 = mr.allocate(64)
     buffer1.close()
     assert re.match(
-        r"DeviceMemoryResourceAttributes\(release_threshold=\d+, reserved_mem_current=\d+, reserved_mem_high=\d+, "
+        r".*Attributes\(release_threshold=\d+, reserved_mem_current=\d+, reserved_mem_high=\d+, "
         r"reuse_allow_internal_dependencies=(True|False), reuse_allow_opportunistic=(True|False), "
         r"reuse_follow_event_dependencies=(True|False), used_mem_current=\d+, used_mem_high=\d+\)",
         str(mr.attributes),
@@ -901,13 +901,13 @@ def test_mempool_attributes_ownership(mempool_device):
     del mr
 
     # After deleting the memory resource, the attributes suite is disconnected.
-    with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"):
+    with pytest.raises(RuntimeError, match="is expired"):
         _ = attributes.used_mem_high
 
     # Even when a new object is created (we found a case where the same
     # mempool handle was really reused).
     mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE))  # noqa: F841
-    with pytest.raises(RuntimeError, match="DeviceMemoryResource is expired"):
+    with pytest.raises(RuntimeError, match="is expired"):
         _ = attributes.used_mem_high
 
 

From a73737855172cb119df89b87fc62a6adc31b0e7c Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Thu, 11 Dec 2025 05:54:57 +0000
Subject: [PATCH 02/14] add pinned memory resource

---
 cuda_core/build_hooks.py                      |   2 +-
 cuda_core/cuda/core/experimental/__init__.py  |   2 +
 cuda_core/cuda/core/experimental/_device.pyx  |   2 +-
 .../core/experimental/_memory/__init__.py     |   1 +
 .../_memory/_device_memory_resource.pyx       |   1 +
 .../experimental/_memory/_memory_pool.pxd     |   1 +
 .../experimental/_memory/_memory_pool.pyx     |  75 ++++---
 .../_memory/_pinned_memory_resource.pxd       |  10 +
 .../_memory/_pinned_memory_resource.pyx       | 184 ++++++++++++++++++
 cuda_core/tests/conftest.py                   |  37 +++-
 cuda_core/tests/test_graph_mem.py             |   2 +-
 cuda_core/tests/test_memory.py                | 155 +++++++++++----
 .../tests/test_multiprocessing_warning.py     |   6 +-
 13 files changed, 397 insertions(+), 81 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx

diff --git a/cuda_core/build_hooks.py b/cuda_core/build_hooks.py
index e38f5676df..6191dcb706 100644
--- a/cuda_core/build_hooks.py
+++ b/cuda_core/build_hooks.py
@@ -95,7 +95,7 @@ def get_cuda_paths():
     )
 
     nthreads = int(os.environ.get("CUDA_PYTHON_PARALLEL_LEVEL", os.cpu_count() // 2))
-    compile_time_env = {"CUDA_CORE_BUILD_MAJOR": _get_proper_cuda_bindings_major_version()}
+    compile_time_env = {"CUDA_CORE_BUILD_MAJOR": int(_get_proper_cuda_bindings_major_version())}
     _extensions = cythonize(
         ext_modules,
         verbose=True,
diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index 826ea70b97..ab8748bce3 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -45,6 +45,8 @@
     GraphMemoryResource,
     LegacyPinnedMemoryResource,
     MemoryResource,
+    PinnedMemoryResource,
+    PinnedMemoryResourceOptions,
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
 )
diff --git a/cuda_core/cuda/core/experimental/_device.pyx b/cuda_core/cuda/core/experimental/_device.pyx
index cd802943a5..8ebbb7b8d7 100644
--- a/cuda_core/cuda/core/experimental/_device.pyx
+++ b/cuda_core/cuda/core/experimental/_device.pyx
@@ -1080,7 +1080,7 @@ class Device:
         if self._uuid is None:
             dev = self._id
             with nogil:
-                IF CUDA_CORE_BUILD_MAJOR == "12":
+                IF CUDA_CORE_BUILD_MAJOR == 12:
                     HANDLE_RETURN(cydriver.cuDeviceGetUuid_v2(&uuid, dev))
                 ELSE:  # 13.0+
                     HANDLE_RETURN(cydriver.cuDeviceGetUuid(&uuid, dev))
diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py
index 20b90d7fdd..b36decf96c 100644
--- a/cuda_core/cuda/core/experimental/_memory/__init__.py
+++ b/cuda_core/cuda/core/experimental/_memory/__init__.py
@@ -7,4 +7,5 @@
 from ._graph_memory_resource import *  # noqa: F403
 from ._ipc import *  # noqa: F403
 from ._legacy import *  # noqa: F403
+from ._pinned_memory_resource import *  # noqa: F403
 from ._virtual_memory_resource import *  # noqa: F403
diff --git a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
index 03389dbd6a..49c590374e 100644
--- a/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_device_memory_resource.pyx
@@ -140,6 +140,7 @@ cdef class DeviceMemoryResource(_MemPool):
             if ipc_enabled and not _ipc.is_supported():
                 raise RuntimeError("IPC is not available on {platform.system()}")
             opts_base._max_size = opts.max_size
+            opts_base._use_current = False
         opts_base._ipc_enabled = ipc_enabled
         opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
         opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd
index eb40d3be12..68b2e6438f 100644
--- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd
+++ b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pxd
@@ -25,3 +25,4 @@ cdef class _MemPoolOptions:
         size_t _max_size
         cydriver.CUmemLocationType _location
         cydriver.CUmemAllocationType _type
+        bint _use_current
diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
index c05e3e20a6..b4e86372dd 100644
--- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
@@ -35,6 +35,7 @@ cdef class _MemPoolOptions:
         self._max_size = 0
         self._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_INVALID
         self._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_INVALID
+        self._use_current = True
 
 
 cdef class _MemPoolAttributes:
@@ -132,14 +133,11 @@ cdef class _MemPool(MemoryResource):
         self._attributes = None
         self._peer_accessible_by = ()
 
-    def __init__(self, device_id: Device | int, _MemPoolOptions opts):
-        from .._device import Device
-        cdef int dev_id = Device(device_id).device_id
-
-        if opts is None:
-            _MP_init_current(self, dev_id)
+    def __init__(self, int device_id, _MemPoolOptions opts):
+        if opts._use_current:
+            _MP_init_current(self, device_id, opts)
         else:
-            _MP_init_create(self, dev_id, opts)
+            _MP_init_create(self, device_id, opts)
 
     def __dealloc__(self):
         _MP_close(self)
@@ -284,43 +282,58 @@ cdef class _MemPool(MemoryResource):
 # _MemPool Implementation
 # -----------------------
 
-cdef int _MP_init_current(_MemPool self, int dev_id) except?-1:
+cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) except?-1:
     # Get the current memory pool.
     cdef cydriver.cuuint64_t current_threshold
     cdef cydriver.cuuint64_t max_threshold = ULLONG_MAX
+    cdef cydriver.CUmemLocation loc
 
     self._dev_id = dev_id
     self._mempool_owned = False
 
     with nogil:
-        HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id))
-
-        # Set a higher release threshold to improve performance when there are
-        # no active allocations.  By default, the release threshold is 0, which
-        # means memory is immediately released back to the OS when there are no
-        # active suballocations, causing performance issues.
-        HANDLE_RETURN(
-            cydriver.cuMemPoolGetAttribute(
-                self._handle,
-                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                &current_threshold
+        if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \
+                and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE:
+            assert dev_id >= 0
+            HANDLE_RETURN(cydriver.cuDeviceGetMemPool(&(self._handle), dev_id))
+
+            # Set a higher release threshold to improve performance when there are
+            # no active allocations.  By default, the release threshold is 0, which
+            # means memory is immediately released back to the OS when there are no
+            # active suballocations, causing performance issues.
+            HANDLE_RETURN(
+                cydriver.cuMemPoolGetAttribute(
+                    self._handle,
+                    cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                    &current_threshold
+                )
             )
-        )
 
-        # If threshold is 0 (default), set it to maximum to retain memory in the pool.
-        if current_threshold == 0:
-            HANDLE_RETURN(cydriver.cuMemPoolSetAttribute(
-                self._handle,
-                cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
-                &max_threshold
-            ))
+            # If threshold is 0 (default), set it to maximum to retain memory in the pool.
+            if current_threshold == 0:
+                HANDLE_RETURN(cydriver.cuMemPoolSetAttribute(
+                    self._handle,
+                    cydriver.CUmemPool_attribute.CU_MEMPOOL_ATTR_RELEASE_THRESHOLD,
+                    &max_threshold
+                ))
+        elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \
+                and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST:
+            IF CUDA_CORE_BUILD_MAJOR >= 13:
+                assert dev_id == -1
+                loc.id = dev_id
+                loc.type = opts._location
+                HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
+            ELSE:
+                raise RuntimeError("not supported")
+        #TODO
+        #elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
+        else:
+            assert False
 
     return 0
 
 
-cdef void _MP_init_create(
-    _MemPool self, int dev_id, _MemPoolOptions opts
-):
+cdef int _MP_init_create(_MemPool self, int dev_id, _MemPoolOptions opts) except?-1:
     cdef cydriver.CUmemPoolProps properties
     memset(&properties, 0, sizeof(cydriver.CUmemPoolProps))
 
@@ -344,6 +357,8 @@ cdef void _MP_init_create(
         alloc_handle = _ipc.MP_export_mempool(self)
         self._ipc_data = _ipc.IPCDataForMR(alloc_handle, False)
 
+    return 0
+
 
 # Raise an exception if the given stream is capturing.
 # A result of CU_STREAM_CAPTURE_STATUS_INVALIDATED is considered an error.
diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd
new file mode 100644
index 0000000000..df225c1860
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pxd
@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.core.experimental._memory._memory_pool cimport _MemPool
+from cuda.core.experimental._memory._ipc cimport IPCDataForMR
+
+
+cdef class PinnedMemoryResource(_MemPool):
+    pass
diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
new file mode 100644
index 0000000000..799bf90a90
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
@@ -0,0 +1,184 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from cuda.bindings cimport cydriver
+from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions
+from cuda.core.experimental._memory cimport _ipc
+from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle
+from cuda.core.experimental._utils.cuda_utils cimport (
+    check_or_create_options,
+)
+
+from dataclasses import dataclass
+from typing import Optional
+import platform  # no-cython-lint
+import uuid
+
+
+__all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions']
+
+
+@dataclass
+cdef class PinnedMemoryResourceOptions:
+    """Customizable :obj:`~_memory.PinnedMemoryResource` options.
+
+    Attributes
+    ----------
+    max_size : int, optional
+        Maximum pool size. When set to 0, defaults to a system-dependent value.
+        (Default to 0)
+    """
+    max_size : int = 0
+
+
+cdef class PinnedMemoryResource(_MemPool):
+    """
+    A host-pinned memory resource managing a stream-ordered memory pool.
+
+    Parameters
+    ----------
+    options : PinnedMemoryResourceOptions
+        Memory resource creation options.
+
+        If set to `None`, the memory resource uses the driver's current
+        stream-ordered memory pool. If no memory
+        pool is set as current, the driver's default memory pool
+        is used.
+
+        If not set to `None`, a new memory pool is created, which is owned by
+        the memory resource.
+
+        When using an existing (current or default) memory pool, the returned
+        host-pinned memory resource does not own the pool (`is_handle_owned` is
+        `False`), and closing the resource has no effect.
+
+    Notes
+    -----
+    IPC (Inter-Process Communication) is not currently supported for pinned
+    memory pools.
+    """
+
+    def __init__(self, options=None):
+        cdef PinnedMemoryResourceOptions opts = check_or_create_options(
+            PinnedMemoryResourceOptions, options, "PinnedMemoryResource options",
+            keep_none=True
+        )
+        cdef _MemPoolOptions opts_base = _MemPoolOptions()
+
+        if opts:
+            opts_base._max_size = opts.max_size
+            opts_base._use_current = False
+        opts_base._ipc_enabled = False  # IPC not supported for pinned memory pools
+        opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+        opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
+
+        super().__init__(-1, opts_base)
+
+    def __reduce__(self):
+        return PinnedMemoryResource.from_registry, (self.uuid,)
+
+    @staticmethod
+    def from_registry(uuid: uuid.UUID) -> PinnedMemoryResource:  # no-cython-lint
+        """
+        Obtain a registered mapped memory resource.
+
+        Raises
+        ------
+        RuntimeError
+            If no mapped memory resource is found in the registry.
+        """
+        return <PinnedMemoryResource>(_ipc.MP_from_registry(uuid))
+
+    def register(self, uuid: uuid.UUID) -> PinnedMemoryResource:  # no-cython-lint
+        """
+        Register a mapped memory resource.
+
+        Returns
+        -------
+        The registered mapped memory resource. If one was previously registered
+        with the given key, it is returned.
+        """
+        return <PinnedMemoryResource>(_ipc.MP_register(self, uuid))
+
+    @classmethod
+    def from_allocation_handle(
+        cls, alloc_handle: int | IPCAllocationHandle
+    ) -> PinnedMemoryResource:
+        """Create a host-pinned memory resource from an allocation handle.
+
+        Construct a new `PinnedMemoryResource` instance that imports a memory
+        pool from a shareable handle. The memory pool is marked as owned.
+
+        Parameters
+        ----------
+        alloc_handle : int | IPCAllocationHandle
+            The shareable handle of the host-pinned memory resource to import. If an
+            integer is supplied, it must represent a valid platform-specific
+            handle. It is the caller's responsibility to close that handle.
+
+        Returns
+        -------
+            A new host-pinned memory resource instance with the imported handle.
+        """
+        cdef PinnedMemoryResource mr = <PinnedMemoryResource>(
+            _ipc.MP_from_allocation_handle(cls, alloc_handle))
+        return mr
+
+    def get_allocation_handle(self) -> IPCAllocationHandle:
+        """Export the memory pool handle to be shared (requires IPC).
+
+        The handle can be used to share the memory pool with other processes.
+        The handle is cached in this `MemoryResource` and owned by it.
+
+        Returns
+        -------
+            The shareable handle for the memory pool.
+
+        Raises
+        ------
+        RuntimeError
+            IPC is not currently supported for pinned memory pools.
+        """
+        raise RuntimeError("IPC is not currently supported for pinned memory pools")
+
+    @property
+    def is_device_accessible(self) -> bool:
+        """Return True. This memory resource provides device-accessible buffers."""
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        """Return True. This memory resource provides host-accessible buffers."""
+        return True
+
+    @property
+    def is_ipc_enabled(self) -> bool:
+        """Whether this memory resource has IPC enabled."""
+        return self._ipc_data is not None
+
+    @property
+    def is_mapped(self) -> bool:
+        """
+        Whether this is a mapping of an IPC-enabled memory resource from
+        another process.  If True, allocation is not permitted.
+        """
+        return self._ipc_data is not None and self._ipc_data._is_mapped
+
+    @property
+    def uuid(self) -> Optional[uuid.UUID]:
+        """
+        A universally unique identifier for this memory resource. Meaningful
+        only for IPC-enabled memory resources.
+        """
+        return getattr(self._ipc_data, 'uuid', None)
+
+
+def _deep_reduce_pinned_memory_resource(mr):
+    raise RuntimeError("IPC is not currently supported for pinned memory pools")
+
+
+# Multiprocessing support disabled until IPC is supported for pinned memory pools
+# multiprocessing.reduction.register(PinnedMemoryResource, _deep_reduce_pinned_memory_resource)
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index c0ea03930e..36b6dc4b32 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -13,7 +13,14 @@
     from cuda import cuda as driver
 
 import cuda.core.experimental
-from cuda.core.experimental import Device, DeviceMemoryResource, DeviceMemoryResourceOptions, _device
+from cuda.core.experimental import (
+    Device,
+    DeviceMemoryResource,
+    DeviceMemoryResourceOptions,
+    PinnedMemoryResource,
+    PinnedMemoryResourceOptions,
+    _device,
+)
 from cuda.core.experimental._utils.cuda_utils import handle_return
 
 
@@ -153,4 +160,32 @@ def mempool_device_x3():
     return _mempool_device_impl(3)
 
 
+@pytest.fixture(
+    params=[
+        pytest.param((DeviceMemoryResource, DeviceMemoryResourceOptions, True), id="DeviceMR-device_object"),
+        pytest.param((DeviceMemoryResource, DeviceMemoryResourceOptions, False), id="DeviceMR-device_id"),
+        pytest.param((PinnedMemoryResource, PinnedMemoryResourceOptions, None), id="PinnedMR"),
+    ]
+)
+def memory_resource_factory_with_device(request, init_cuda):
+    """Parametrized fixture providing memory resource types with device usage configuration.
+
+    Returns a 3-tuple of (MRClass, MROptionClass, use_device_object).
+    For DeviceMemoryResource, use_device_object is True/False indicating whether to pass
+    a Device object or device_id. For PinnedMemoryResource, use_device_object is None
+    as it doesn't require a device parameter.
+
+    Usage:
+        def test_something(memory_resource_factory_with_device):
+            MRClass, MROptions, use_device_object = memory_resource_factory_with_device
+            device = Device(0)
+            if MRClass is DeviceMemoryResource:
+                device_arg = device if use_device_object else device.device_id
+                mr = MRClass(device_arg)
+            elif MRClass is PinnedMemoryResource:
+                mr = MRClass()
+    """
+    return request.param
+
+
 skipif_need_cuda_headers = pytest.mark.skipif(helpers.CUDA_INCLUDE_PATH is None, reason="need CUDA header")
diff --git a/cuda_core/tests/test_graph_mem.py b/cuda_core/tests/test_graph_mem.py
index 964ce03b93..15b34dc359 100644
--- a/cuda_core/tests/test_graph_mem.py
+++ b/cuda_core/tests/test_graph_mem.py
@@ -275,7 +275,7 @@ def test_dmr_check_capture_state(mempool_device, mode):
     gb = device.create_graph_builder().begin_building(mode=mode)
     with pytest.raises(
         RuntimeError,
-        match=r"DeviceMemoryResource cannot perform memory operations on a capturing "
+        match=r"cannot perform memory operations on a capturing "
         r"stream \(consider using GraphMemoryResource\)\.",
     ):
         dmr.allocate(1, stream=gb)
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 2dceeb494b..c990405894 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -24,6 +24,7 @@
     DeviceMemoryResourceOptions,
     GraphMemoryResource,
     MemoryResource,
+    PinnedMemoryResource,
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
 )
@@ -132,6 +133,8 @@ def test_package_contents():
         "IPCBufferDescriptor",
         "IPCAllocationHandle",
         "LegacyPinnedMemoryResource",
+        "PinnedMemoryResourceOptions",
+        "PinnedMemoryResource",
         "VirtualMemoryResourceOptions",
         "VirtualMemoryResource",
     ]
@@ -512,35 +515,42 @@ def test_buffer_dlpack_failure_clean_up():
     assert after == before
 
 
-@pytest.mark.parametrize("use_device_object", [True, False])
-def test_device_memory_resource_initialization(use_device_object):
-    """Test that DeviceMemoryResource can be initialized successfully.
-
-    This test verifies that the DeviceMemoryResource initializes properly,
-    including the release threshold configuration for performance optimization.
-    """
+def test_modern_device_memory_resource_initialization(memory_resource_factory_with_device):
     device = Device()
+    MR, MRops, use_device_object = memory_resource_factory_with_device
 
-    if not device.properties.memory_pools_supported:
+    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
+    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
+        pytest.skip("Device does not support host mempool operations")
 
     device.set_current()
 
-    # This should succeed and configure the memory pool release threshold.
-    # The resource can be constructed from either a device or device ordinal.
-    device_arg = device if use_device_object else device.device_id
-    mr = DeviceMemoryResource(device_arg)
-
-    # Verify basic properties
-    assert mr.device_id == device.device_id
-    assert mr.is_device_accessible
-    assert not mr.is_host_accessible
-    assert not mr.is_ipc_enabled
+    if MR is DeviceMemoryResource:
+        # This should succeed and configure the memory pool release threshold.
+        # The resource can be constructed from either a device or device ordinal.
+        device_arg = device if use_device_object else device.device_id
+        mr = MR(device_arg)
+        assert mr.device_id == device.device_id
+        assert mr.is_device_accessible
+        assert not mr.is_host_accessible
+        assert not mr.is_ipc_enabled
+    elif MR is PinnedMemoryResource:
+        mr = PinnedMemoryResource()
+        assert mr.is_device_accessible
+        assert mr.is_host_accessible
 
     # Test allocation/deallocation works
     buffer = mr.allocate(1024)
     assert buffer.size == 1024
-    assert buffer.device_id == device.device_id
+    if MR is DeviceMemoryResource:
+        assert buffer.device_id == device.device_id
+        assert not buffer.is_host_accessible
+    elif MR is PinnedMemoryResource:
+        assert buffer.device_id == -1  # Not bound to any GPU
+        assert buffer.is_host_accessible
+    assert buffer.memory_resource == mr
+    assert buffer.is_device_accessible
     buffer.close()
 
 
@@ -745,21 +755,32 @@ def test_vmm_allocator_rdma_unsupported_exception():
         VirtualMemoryResource(device, config=options)
 
 
-def test_device_memory_resource():
+def test_modern_memory_resources(memory_resource_factory_with_device):
     device = Device()
 
-    if not device.properties.memory_pools_supported:
+    MR, MRops, _ = memory_resource_factory_with_device
+
+    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
+    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
+        pytest.skip("Device does not support host mempool operations")
 
     device.set_current()
 
     # Test basic pool creation
-    options = DeviceMemoryResourceOptions(max_size=POOL_SIZE)
-    mr = DeviceMemoryResource(device, options=options)
-    assert mr.device_id == device.device_id
-    assert mr.is_device_accessible
-    assert not mr.is_host_accessible
-    assert not mr.is_ipc_enabled
+    options = MRops(max_size=POOL_SIZE)
+    if MR is DeviceMemoryResource:
+        mr = MR(device, options=options)
+        assert mr.device_id == device.device_id
+        assert mr.is_device_accessible
+        assert not mr.is_host_accessible
+        assert not mr.is_ipc_enabled
+    elif MR is PinnedMemoryResource:
+        mr = MR(options)
+        assert mr.device_id == -1  # Not bound to any GPU
+        assert mr.is_device_accessible
+        assert mr.is_host_accessible
+        assert not mr.is_ipc_enabled
 
     # Test allocation and deallocation
     buffer1 = mr.allocate(1024)
@@ -781,7 +802,7 @@ def test_device_memory_resource():
     stream = device.create_stream()
     buffer = mr.allocate(1024, stream=stream)
     assert buffer.handle != 0
-    buffer.close()
+    buffer.close(stream)
 
     # Test memory copying between buffers from same pool
     src_buffer = mr.allocate(64)
@@ -828,18 +849,36 @@ def test_mempool_ipc_errors(mempool_device):
         ("used_mem_high", int),
     ],
 )
-def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected_type):
-    """Test all properties of the DeviceMemoryResource class."""
-    device = mempool_device
+def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, property_name, expected_type):
+    """Test all properties of memory pool attributes for DeviceMemoryResource and PinnedMemoryResource."""
+    MR, MRops, _ = memory_resource_factory_with_device
+    device = Device()
+
+    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
+    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
+        pytest.skip("Device does not support host mempool operations")
+
+    # PinnedMemoryResource does not support IPC
+    if MR is PinnedMemoryResource and ipc_enabled:
+        pytest.skip("PinnedMemoryResource does not support IPC")
+
+    device.set_current()
+
     if platform.system() == "Windows":
         return  # IPC not implemented for Windows
 
     if ipc_enabled and not supports_ipc_mempool(device):
         pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
 
-    options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=ipc_enabled)
-    mr = DeviceMemoryResource(device, options=options)
-    assert mr.is_ipc_enabled == ipc_enabled
+    if MR is DeviceMemoryResource:
+        options = MRops(max_size=POOL_SIZE, ipc_enabled=ipc_enabled)
+        mr = MR(device, options=options)
+        assert mr.is_ipc_enabled == ipc_enabled
+    elif MR is PinnedMemoryResource:
+        options = MRops(max_size=POOL_SIZE)
+        mr = MR(options)
+        assert not mr.is_ipc_enabled
 
     # Get the property value
     value = getattr(mr.attributes, property_name)
@@ -872,10 +911,23 @@ def test_mempool_attributes(ipc_enabled, mempool_device, property_name, expected
         assert value >= current_value, f"{property_name} should be >= {current_prop}"
 
 
-def test_mempool_attributes_repr(mempool_device):
+def test_mempool_attributes_repr(memory_resource_factory_with_device):
+    """Test the repr of memory pool attributes for both DeviceMemoryResource and PinnedMemoryResource."""
+    MR, MRops, _ = memory_resource_factory_with_device
     device = Device()
+
+    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
+    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
+        pytest.skip("Device does not support host mempool operations")
+
     device.set_current()
-    mr = DeviceMemoryResource(device, options={"max_size": 2048})
+
+    if MR is DeviceMemoryResource:
+        mr = MR(device, options={"max_size": 2048})
+    elif MR is PinnedMemoryResource:
+        mr = MR(options={"max_size": 2048})
+
     buffer1 = mr.allocate(64)
     buffer2 = mr.allocate(64)
     buffer1.close()
@@ -888,14 +940,27 @@ def test_mempool_attributes_repr(mempool_device):
     buffer2.close()
 
 
-def test_mempool_attributes_ownership(mempool_device):
-    """Ensure the attributes bundle handles references correctly."""
-    device = mempool_device
-    # Skip if IPC mempool is not supported on this platform/device
-    if not supports_ipc_mempool(device):
+def test_mempool_attributes_ownership(memory_resource_factory_with_device):
+    """Ensure the attributes bundle handles references correctly for both memory resource types."""
+    MR, MRops, _ = memory_resource_factory_with_device
+    device = Device()
+
+    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
+    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
+        pytest.skip("Device does not support host mempool operations")
+
+    # Skip if IPC mempool is not supported on this platform/device (only relevant for DeviceMemoryResource)
+    if MR is DeviceMemoryResource and not supports_ipc_mempool(device):
         pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
 
-    mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE))
+    device.set_current()
+
+    if MR is DeviceMemoryResource:
+        mr = MR(device, dict(max_size=POOL_SIZE))
+    elif MR is PinnedMemoryResource:
+        mr = MR(dict(max_size=POOL_SIZE))
+
     attributes = mr.attributes
     mr.close()
     del mr
@@ -906,7 +971,11 @@ def test_mempool_attributes_ownership(mempool_device):
 
     # Even when a new object is created (we found a case where the same
     # mempool handle was really reused).
-    mr = DeviceMemoryResource(device, dict(max_size=POOL_SIZE))  # noqa: F841
+    if MR is DeviceMemoryResource:
+        mr = MR(device, dict(max_size=POOL_SIZE))  # noqa: F841
+    elif MR is PinnedMemoryResource:
+        mr = MR(dict(max_size=POOL_SIZE))  # noqa: F841
+
     with pytest.raises(RuntimeError, match="is expired"):
         _ = attributes.used_mem_high
 
diff --git a/cuda_core/tests/test_multiprocessing_warning.py b/cuda_core/tests/test_multiprocessing_warning.py
index 945ea83964..8b490af233 100644
--- a/cuda_core/tests/test_multiprocessing_warning.py
+++ b/cuda_core/tests/test_multiprocessing_warning.py
@@ -14,10 +14,8 @@
 
 from cuda.core.experimental import DeviceMemoryResource, DeviceMemoryResourceOptions, EventOptions
 from cuda.core.experimental._event import _reduce_event
-from cuda.core.experimental._memory._ipc import (
-    _deep_reduce_device_memory_resource,
-    _reduce_allocation_handle,
-)
+from cuda.core.experimental._memory._device_memory_resource import _deep_reduce_device_memory_resource
+from cuda.core.experimental._memory._ipc import _reduce_allocation_handle
 from cuda.core.experimental._utils.cuda_utils import reset_fork_warning
 
 

From ac048f503e08c55bc321bb8ffe507b0d81012876 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Thu, 11 Dec 2025 20:28:19 +0000
Subject: [PATCH 03/14] add managed memory resource

---
 cuda_core/cuda/core/experimental/__init__.py  |   2 +
 .../core/experimental/_memory/__init__.py     |   1 +
 .../_memory/_managed_memory_resource.pxd      |   9 +
 .../_memory/_managed_memory_resource.pyx      | 201 ++++++++++++++++++
 .../experimental/_memory/_memory_pool.pyx     |  10 +-
 cuda_core/tests/conftest.py                   |   9 +-
 cuda_core/tests/test_memory.py                |  47 +++-
 7 files changed, 265 insertions(+), 14 deletions(-)
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd
 create mode 100644 cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx

diff --git a/cuda_core/cuda/core/experimental/__init__.py b/cuda_core/cuda/core/experimental/__init__.py
index ab8748bce3..92174468d1 100644
--- a/cuda_core/cuda/core/experimental/__init__.py
+++ b/cuda_core/cuda/core/experimental/__init__.py
@@ -44,6 +44,8 @@
     DeviceMemoryResourceOptions,
     GraphMemoryResource,
     LegacyPinnedMemoryResource,
+    ManagedMemoryResource,
+    ManagedMemoryResourceOptions,
     MemoryResource,
     PinnedMemoryResource,
     PinnedMemoryResourceOptions,
diff --git a/cuda_core/cuda/core/experimental/_memory/__init__.py b/cuda_core/cuda/core/experimental/_memory/__init__.py
index b36decf96c..9d141ebca2 100644
--- a/cuda_core/cuda/core/experimental/_memory/__init__.py
+++ b/cuda_core/cuda/core/experimental/_memory/__init__.py
@@ -7,5 +7,6 @@
 from ._graph_memory_resource import *  # noqa: F403
 from ._ipc import *  # noqa: F403
 from ._legacy import *  # noqa: F403
+from ._managed_memory_resource import *  # noqa: F403
 from ._pinned_memory_resource import *  # noqa: F403
 from ._virtual_memory_resource import *  # noqa: F403
diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd
new file mode 100644
index 0000000000..3e9aed7bee
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pxd
@@ -0,0 +1,9 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from cuda.core.experimental._memory._memory_pool cimport _MemPool
+
+
+cdef class ManagedMemoryResource(_MemPool):
+    pass
diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
new file mode 100644
index 0000000000..0b74833054
--- /dev/null
+++ b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
@@ -0,0 +1,201 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from __future__ import annotations
+
+from cuda.bindings cimport cydriver
+from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions
+from cuda.core.experimental._memory cimport _ipc
+from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle
+from cuda.core.experimental._utils.cuda_utils cimport (
+    check_or_create_options,
+)
+
+from dataclasses import dataclass
+from typing import Optional
+import uuid
+
+__all__ = ['ManagedMemoryResource', 'ManagedMemoryResourceOptions']
+
+
+@dataclass
+cdef class ManagedMemoryResourceOptions:
+    """Customizable :obj:`~_memory.ManagedMemoryResource` options.
+
+    Attributes
+    ----------
+    preferred_location : int, optional
+        The preferred device location for the managed memory.
+        Use a device ID (0, 1, 2, ...) for device preference, or -1 for CPU/host.
+        (Default to -1 for CPU/host)
+
+    max_size : int, optional
+        Maximum pool size. When set to 0, defaults to a system-dependent value.
+        (Default to 0)
+    """
+    preferred_location : int = -1
+    max_size : int = 0
+
+
+cdef class ManagedMemoryResource(_MemPool):
+    """
+    A managed memory resource managing a stream-ordered memory pool.
+
+    Managed memory is accessible from both the host and device, with automatic
+    migration between them as needed.
+
+    Parameters
+    ----------
+    options : ManagedMemoryResourceOptions
+        Memory resource creation options.
+
+        If set to `None`, the memory resource uses the driver's current
+        stream-ordered memory pool. If no memory pool is set as current,
+        the driver's default memory pool is used.
+
+        If not set to `None`, a new memory pool is created, which is owned by
+        the memory resource.
+
+        When using an existing (current or default) memory pool, the returned
+        managed memory resource does not own the pool (`is_handle_owned` is
+        `False`), and closing the resource has no effect.
+
+    Notes
+    -----
+    IPC (Inter-Process Communication) is not currently supported for managed
+    memory pools.
+    """
+
+    def __init__(self, options=None):
+        cdef ManagedMemoryResourceOptions opts = check_or_create_options(
+            ManagedMemoryResourceOptions, options, "ManagedMemoryResource options",
+            keep_none=True
+        )
+        cdef _MemPoolOptions opts_base = _MemPoolOptions()
+
+        cdef int device_id = -1  # Default: CPU/host preference
+        if opts:
+            device_id = opts.preferred_location
+            opts_base._max_size = opts.max_size
+            opts_base._use_current = False
+
+        opts_base._ipc_enabled = False  # IPC not supported for managed memory pools
+
+        # Set location based on preferred_location
+        if device_id == -1:
+            # CPU/host preference
+            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+        else:
+            # Device preference
+            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+
+        opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
+
+        super().__init__(device_id, opts_base)
+
+    def __reduce__(self):
+        return ManagedMemoryResource.from_registry, (self.uuid,)
+
+    @staticmethod
+    def from_registry(uuid: uuid.UUID) -> ManagedMemoryResource:  # no-cython-lint
+        """
+        Obtain a registered mapped memory resource.
+
+        Raises
+        ------
+        RuntimeError
+            If no mapped memory resource is found in the registry.
+        """
+        return <ManagedMemoryResource>(_ipc.MP_from_registry(uuid))
+
+    def register(self, uuid: uuid.UUID) -> ManagedMemoryResource:  # no-cython-lint
+        """
+        Register a mapped memory resource.
+
+        Returns
+        -------
+        The registered mapped memory resource. If one was previously registered
+        with the given key, it is returned.
+        """
+        return <ManagedMemoryResource>(_ipc.MP_register(self, uuid))
+
+    @classmethod
+    def from_allocation_handle(
+        cls, alloc_handle: int | IPCAllocationHandle
+    ) -> ManagedMemoryResource:
+        """Create a managed memory resource from an allocation handle.
+
+        Construct a new `ManagedMemoryResource` instance that imports a memory
+        pool from a shareable handle. The memory pool is marked as owned.
+
+        Parameters
+        ----------
+        alloc_handle : int | IPCAllocationHandle
+            The shareable handle of the managed memory resource to import. If an
+            integer is supplied, it must represent a valid platform-specific
+            handle. It is the caller's responsibility to close that handle.
+
+        Returns
+        -------
+            A new managed memory resource instance with the imported handle.
+        """
+        cdef ManagedMemoryResource mr = <ManagedMemoryResource>(
+            _ipc.MP_from_allocation_handle(cls, alloc_handle))
+        return mr
+
+    def get_allocation_handle(self) -> IPCAllocationHandle:
+        """Export the memory pool handle to be shared (requires IPC).
+
+        The handle can be used to share the memory pool with other processes.
+        The handle is cached in this `MemoryResource` and owned by it.
+
+        Returns
+        -------
+            The shareable handle for the memory pool.
+
+        Raises
+        ------
+        RuntimeError
+            IPC is not currently supported for managed memory pools.
+        """
+        raise RuntimeError("IPC is not currently supported for managed memory pools")
+
+    @property
+    def is_device_accessible(self) -> bool:
+        """Return True. This memory resource provides device-accessible buffers."""
+        return True
+
+    @property
+    def is_host_accessible(self) -> bool:
+        """Return True. This memory resource provides host-accessible buffers."""
+        return True
+
+    @property
+    def is_ipc_enabled(self) -> bool:
+        """Whether this memory resource has IPC enabled."""
+        return self._ipc_data is not None
+
+    @property
+    def is_mapped(self) -> bool:
+        """
+        Whether this is a mapping of an IPC-enabled memory resource from
+        another process.  If True, allocation is not permitted.
+        """
+        return self._ipc_data is not None and self._ipc_data._is_mapped
+
+    @property
+    def uuid(self) -> Optional[uuid.UUID]:
+        """
+        A universally unique identifier for this memory resource. Meaningful
+        only for IPC-enabled memory resources.
+        """
+        return getattr(self._ipc_data, 'uuid', None)
+
+
+def _deep_reduce_managed_memory_resource(mr):
+    raise RuntimeError("IPC is not currently supported for managed memory pools")
+
+
+# Multiprocessing support disabled until IPC is supported for managed memory pools
+# multiprocessing.reduction.register(ManagedMemoryResource, _deep_reduce_managed_memory_resource)
diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
index b4e86372dd..5ea88f2944 100644
--- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
@@ -325,8 +325,14 @@ cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) excep
                 HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
             ELSE:
                 raise RuntimeError("not supported")
-        #TODO
-        #elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
+        elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
+            # Managed memory pools
+            IF CUDA_CORE_BUILD_MAJOR >= 13:
+                loc.id = dev_id
+                loc.type = opts._location
+                HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
+            ELSE:
+                raise RuntimeError("Managed memory pools not supported in CUDA < 13")
         else:
             assert False
 
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 36b6dc4b32..5e1b401d40 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -17,6 +17,8 @@
     Device,
     DeviceMemoryResource,
     DeviceMemoryResourceOptions,
+    ManagedMemoryResource,
+    ManagedMemoryResourceOptions,
     PinnedMemoryResource,
     PinnedMemoryResourceOptions,
     _device,
@@ -165,6 +167,7 @@ def mempool_device_x3():
         pytest.param((DeviceMemoryResource, DeviceMemoryResourceOptions, True), id="DeviceMR-device_object"),
         pytest.param((DeviceMemoryResource, DeviceMemoryResourceOptions, False), id="DeviceMR-device_id"),
         pytest.param((PinnedMemoryResource, PinnedMemoryResourceOptions, None), id="PinnedMR"),
+        pytest.param((ManagedMemoryResource, ManagedMemoryResourceOptions, None), id="ManagedMR"),
     ]
 )
 def memory_resource_factory_with_device(request, init_cuda):
@@ -172,8 +175,8 @@ def memory_resource_factory_with_device(request, init_cuda):
 
     Returns a 3-tuple of (MRClass, MROptionClass, use_device_object).
     For DeviceMemoryResource, use_device_object is True/False indicating whether to pass
-    a Device object or device_id. For PinnedMemoryResource, use_device_object is None
-    as it doesn't require a device parameter.
+    a Device object or device_id. For PinnedMemoryResource and ManagedMemoryResource,
+    use_device_object is None as they don't require a device parameter.
 
     Usage:
         def test_something(memory_resource_factory_with_device):
@@ -184,6 +187,8 @@ def test_something(memory_resource_factory_with_device):
                 mr = MRClass(device_arg)
             elif MRClass is PinnedMemoryResource:
                 mr = MRClass()
+            elif MRClass is ManagedMemoryResource:
+                mr = MRClass()
     """
     return request.param
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index c990405894..66d160eb44 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -23,6 +23,7 @@
     DeviceMemoryResource,
     DeviceMemoryResourceOptions,
     GraphMemoryResource,
+    ManagedMemoryResource,
     MemoryResource,
     PinnedMemoryResource,
     VirtualMemoryResource,
@@ -133,6 +134,8 @@ def test_package_contents():
         "IPCBufferDescriptor",
         "IPCAllocationHandle",
         "LegacyPinnedMemoryResource",
+        "ManagedMemoryResource",
+        "ManagedMemoryResourceOptions",
         "PinnedMemoryResourceOptions",
         "PinnedMemoryResource",
         "VirtualMemoryResourceOptions",
@@ -523,6 +526,8 @@ def test_modern_device_memory_resource_initialization(memory_resource_factory_wi
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
+    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
 
     device.set_current()
 
@@ -539,6 +544,11 @@ def test_modern_device_memory_resource_initialization(memory_resource_factory_wi
         mr = PinnedMemoryResource()
         assert mr.is_device_accessible
         assert mr.is_host_accessible
+    elif MR is ManagedMemoryResource:
+        mr = ManagedMemoryResource()
+        assert mr.is_device_accessible
+        assert mr.is_host_accessible
+        assert mr.device_id == -1  # Default preferred location is CPU
 
     # Test allocation/deallocation works
     buffer = mr.allocate(1024)
@@ -549,6 +559,9 @@ def test_modern_device_memory_resource_initialization(memory_resource_factory_wi
     elif MR is PinnedMemoryResource:
         assert buffer.device_id == -1  # Not bound to any GPU
         assert buffer.is_host_accessible
+    elif MR is ManagedMemoryResource:
+        assert buffer.device_id == -1  # Managed memory with CPU preference
+        assert buffer.is_host_accessible  # But accessible from host
     assert buffer.memory_resource == mr
     assert buffer.is_device_accessible
     buffer.close()
@@ -764,6 +777,8 @@ def test_modern_memory_resources(memory_resource_factory_with_device):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
+    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
 
     device.set_current()
 
@@ -781,6 +796,12 @@ def test_modern_memory_resources(memory_resource_factory_with_device):
         assert mr.is_device_accessible
         assert mr.is_host_accessible
         assert not mr.is_ipc_enabled
+    elif MR is ManagedMemoryResource:
+        mr = MR(options)
+        assert mr.device_id == -1  # Default preferred location is CPU
+        assert mr.is_device_accessible
+        assert mr.is_host_accessible
+        assert not mr.is_ipc_enabled
 
     # Test allocation and deallocation
     buffer1 = mr.allocate(1024)
@@ -850,7 +871,7 @@ def test_mempool_ipc_errors(mempool_device):
     ],
 )
 def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, property_name, expected_type):
-    """Test all properties of memory pool attributes for DeviceMemoryResource and PinnedMemoryResource."""
+    """Test all properties of memory pool attributes for all memory resource types."""
     MR, MRops, _ = memory_resource_factory_with_device
     device = Device()
 
@@ -858,10 +879,12 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, pr
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
+    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
 
-    # PinnedMemoryResource does not support IPC
-    if MR is PinnedMemoryResource and ipc_enabled:
-        pytest.skip("PinnedMemoryResource does not support IPC")
+    # PinnedMemoryResource and ManagedMemoryResource do not support IPC
+    if (MR is PinnedMemoryResource or MR is ManagedMemoryResource) and ipc_enabled:
+        pytest.skip(f"{MR.__name__} does not support IPC")
 
     device.set_current()
 
@@ -875,7 +898,7 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, pr
         options = MRops(max_size=POOL_SIZE, ipc_enabled=ipc_enabled)
         mr = MR(device, options=options)
         assert mr.is_ipc_enabled == ipc_enabled
-    elif MR is PinnedMemoryResource:
+    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
         options = MRops(max_size=POOL_SIZE)
         mr = MR(options)
         assert not mr.is_ipc_enabled
@@ -912,7 +935,7 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, pr
 
 
 def test_mempool_attributes_repr(memory_resource_factory_with_device):
-    """Test the repr of memory pool attributes for both DeviceMemoryResource and PinnedMemoryResource."""
+    """Test the repr of memory pool attributes for all memory resource types."""
     MR, MRops, _ = memory_resource_factory_with_device
     device = Device()
 
@@ -920,12 +943,14 @@ def test_mempool_attributes_repr(memory_resource_factory_with_device):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
+    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
 
     device.set_current()
 
     if MR is DeviceMemoryResource:
         mr = MR(device, options={"max_size": 2048})
-    elif MR is PinnedMemoryResource:
+    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
         mr = MR(options={"max_size": 2048})
 
     buffer1 = mr.allocate(64)
@@ -941,7 +966,7 @@ def test_mempool_attributes_repr(memory_resource_factory_with_device):
 
 
 def test_mempool_attributes_ownership(memory_resource_factory_with_device):
-    """Ensure the attributes bundle handles references correctly for both memory resource types."""
+    """Ensure the attributes bundle handles references correctly for all memory resource types."""
     MR, MRops, _ = memory_resource_factory_with_device
     device = Device()
 
@@ -949,6 +974,8 @@ def test_mempool_attributes_ownership(memory_resource_factory_with_device):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
+    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
 
     # Skip if IPC mempool is not supported on this platform/device (only relevant for DeviceMemoryResource)
     if MR is DeviceMemoryResource and not supports_ipc_mempool(device):
@@ -958,7 +985,7 @@ def test_mempool_attributes_ownership(memory_resource_factory_with_device):
 
     if MR is DeviceMemoryResource:
         mr = MR(device, dict(max_size=POOL_SIZE))
-    elif MR is PinnedMemoryResource:
+    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
         mr = MR(dict(max_size=POOL_SIZE))
 
     attributes = mr.attributes
@@ -973,7 +1000,7 @@ def test_mempool_attributes_ownership(memory_resource_factory_with_device):
     # mempool handle was really reused).
     if MR is DeviceMemoryResource:
         mr = MR(device, dict(max_size=POOL_SIZE))  # noqa: F841
-    elif MR is PinnedMemoryResource:
+    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
         mr = MR(dict(max_size=POOL_SIZE))  # noqa: F841
 
     with pytest.raises(RuntimeError, match="is expired"):

From 28138b97f033ac100f1acc5f838869dc88c174df Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 04:23:10 +0000
Subject: [PATCH 04/14] fix MMR bug + refactor tests + enable IPC for PMR

---
 .../core/experimental/_memory/_buffer.pyx     |   5 +-
 .../_memory/_managed_memory_resource.pyx      |   6 -
 .../experimental/_memory/_memory_pool.pyx     |   9 +
 .../_memory/_pinned_memory_resource.pyx       |  84 ++++-
 cuda_core/tests/conftest.py                   |  25 +-
 cuda_core/tests/test_memory.py                | 324 ++++++++++++++----
 6 files changed, 346 insertions(+), 107 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
index b26471ed0e..8f4ac46051 100644
--- a/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_buffer.pyx
@@ -8,7 +8,8 @@ cimport cython
 from libc.stdint cimport uintptr_t, int64_t, uint64_t
 
 from cuda.bindings cimport cydriver
-from cuda.core.experimental._memory._device_memory_resource cimport DeviceMemoryResource
+from cuda.core.experimental._memory._device_memory_resource import DeviceMemoryResource
+from cuda.core.experimental._memory._pinned_memory_resource import PinnedMemoryResource
 from cuda.core.experimental._memory._ipc cimport IPCBufferDescriptor, IPCDataForBuffer
 from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._stream cimport Stream_accept, Stream
@@ -106,7 +107,7 @@ cdef class Buffer:
 
     @classmethod
     def from_ipc_descriptor(
-        cls, mr: DeviceMemoryResource, ipc_descriptor: IPCBufferDescriptor,
+        cls, mr: DeviceMemoryResource | PinnedMemoryResource, ipc_descriptor: IPCBufferDescriptor,
         stream: Stream = None
     ) -> Buffer:
         """Import a buffer that was exported from another process."""
diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
index 0b74833054..b3f98f59bb 100644
--- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
@@ -29,13 +29,8 @@ cdef class ManagedMemoryResourceOptions:
         The preferred device location for the managed memory.
         Use a device ID (0, 1, 2, ...) for device preference, or -1 for CPU/host.
         (Default to -1 for CPU/host)
-
-    max_size : int, optional
-        Maximum pool size. When set to 0, defaults to a system-dependent value.
-        (Default to 0)
     """
     preferred_location : int = -1
-    max_size : int = 0
 
 
 cdef class ManagedMemoryResource(_MemPool):
@@ -77,7 +72,6 @@ cdef class ManagedMemoryResource(_MemPool):
         cdef int device_id = -1  # Default: CPU/host preference
         if opts:
             device_id = opts.preferred_location
-            opts_base._max_size = opts.max_size
             opts_base._use_current = False
 
         opts_base._ipc_enabled = False  # IPC not supported for managed memory pools
diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
index 5ea88f2944..5a6c240b09 100644
--- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
@@ -325,6 +325,15 @@ cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) excep
                 HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
             ELSE:
                 raise RuntimeError("not supported")
+        elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED \
+                and opts._location == cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA:
+            IF CUDA_CORE_BUILD_MAJOR >= 13:
+                assert dev_id == 0
+                loc.id = 0
+                loc.type = opts._location
+                HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
+            ELSE:
+                raise RuntimeError("not supported")
         elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
             # Managed memory pools
             IF CUDA_CORE_BUILD_MAJOR >= 13:
diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
index 799bf90a90..4a18a0a43c 100644
--- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
@@ -14,8 +14,42 @@ from cuda.core.experimental._utils.cuda_utils cimport (
 
 from dataclasses import dataclass
 from typing import Optional
+import multiprocessing
 import platform  # no-cython-lint
+import subprocess
 import uuid
+import warnings
+
+from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method
+
+
+def _check_numa_nodes():
+    """Check if system has multiple NUMA nodes and warn if so."""
+    if platform.system() != "Linux":
+        return
+
+    try:
+        result = subprocess.run(
+            ["lscpu"],
+            capture_output=True,
+            text=True,
+            timeout=1
+        )
+        for line in result.stdout.splitlines():
+            if line.startswith("NUMA node(s):"):
+                numa_count = int(line.split(":")[1].strip())
+                if numa_count > 1:
+                    warnings.warn(
+                        f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory "
+                        f"uses location ID 0, which may not work correctly with multiple "
+                        f"NUMA nodes.",
+                        UserWarning,
+                        stacklevel=3
+                    )
+                break
+    except (subprocess.SubprocessError, ValueError, FileNotFoundError):
+        # If we can't check, don't warn
+        pass
 
 
 __all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions']
@@ -27,10 +61,16 @@ cdef class PinnedMemoryResourceOptions:
 
     Attributes
     ----------
+    ipc_enabled : bool, optional
+        Specifies whether to create an IPC-enabled memory pool. When set to
+        True, the memory pool and its allocations can be shared with other
+        processes. (Default to False)
+
     max_size : int, optional
         Maximum pool size. When set to 0, defaults to a system-dependent value.
         (Default to 0)
     """
+    ipc_enabled : bool = False
     max_size : int = 0
 
 
@@ -57,8 +97,16 @@ cdef class PinnedMemoryResource(_MemPool):
 
     Notes
     -----
-    IPC (Inter-Process Communication) is not currently supported for pinned
-    memory pools.
+    To create an IPC-Enabled memory resource (MR) that is capable of sharing
+    allocations between processes, specify ``ipc_enabled=True`` in the initializer
+    option. When IPC is enabled, the location type is automatically set to
+    CU_MEM_LOCATION_TYPE_HOST_NUMA instead of CU_MEM_LOCATION_TYPE_HOST,
+    with location ID 0.
+
+    Note: IPC support for pinned memory requires a single NUMA node. A warning
+    is issued if multiple NUMA nodes are detected.
+
+    See :class:`DeviceMemoryResource` for more details on IPC usage patterns.
     """
 
     def __init__(self, options=None):
@@ -68,14 +116,24 @@ cdef class PinnedMemoryResource(_MemPool):
         )
         cdef _MemPoolOptions opts_base = _MemPoolOptions()
 
+        cdef bint ipc_enabled = False
         if opts:
+            ipc_enabled = opts.ipc_enabled
+            if ipc_enabled and not _ipc.is_supported():
+                raise RuntimeError(f"IPC is not available on {platform.system()}")
+            if ipc_enabled:
+                # Check for multiple NUMA nodes on Linux
+                _check_numa_nodes()
             opts_base._max_size = opts.max_size
             opts_base._use_current = False
-        opts_base._ipc_enabled = False  # IPC not supported for pinned memory pools
-        opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+        opts_base._ipc_enabled = ipc_enabled
+        if ipc_enabled:
+            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST_NUMA
+        else:
+            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
         opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_PINNED
 
-        super().__init__(-1, opts_base)
+        super().__init__(0 if ipc_enabled else -1, opts_base)
 
     def __reduce__(self):
         return PinnedMemoryResource.from_registry, (self.uuid,)
@@ -136,13 +194,10 @@ cdef class PinnedMemoryResource(_MemPool):
         Returns
         -------
             The shareable handle for the memory pool.
-
-        Raises
-        ------
-        RuntimeError
-            IPC is not currently supported for pinned memory pools.
         """
-        raise RuntimeError("IPC is not currently supported for pinned memory pools")
+        if not self.is_ipc_enabled:
+            raise RuntimeError("Memory resource is not IPC-enabled")
+        return self._ipc_data._alloc_handle
 
     @property
     def is_device_accessible(self) -> bool:
@@ -177,8 +232,9 @@ cdef class PinnedMemoryResource(_MemPool):
 
 
 def _deep_reduce_pinned_memory_resource(mr):
-    raise RuntimeError("IPC is not currently supported for pinned memory pools")
+    check_multiprocessing_start_method()
+    alloc_handle = mr.get_allocation_handle()
+    return mr.from_allocation_handle, (alloc_handle,)
 
 
-# Multiprocessing support disabled until IPC is supported for pinned memory pools
-# multiprocessing.reduction.register(PinnedMemoryResource, _deep_reduce_pinned_memory_resource)
+multiprocessing.reduction.register(PinnedMemoryResource, _deep_reduce_pinned_memory_resource)
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 5e1b401d40..ce57ef237a 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -164,27 +164,22 @@ def mempool_device_x3():
 
 @pytest.fixture(
     params=[
-        pytest.param((DeviceMemoryResource, DeviceMemoryResourceOptions, True), id="DeviceMR-device_object"),
-        pytest.param((DeviceMemoryResource, DeviceMemoryResourceOptions, False), id="DeviceMR-device_id"),
-        pytest.param((PinnedMemoryResource, PinnedMemoryResourceOptions, None), id="PinnedMR"),
-        pytest.param((ManagedMemoryResource, ManagedMemoryResourceOptions, None), id="ManagedMR"),
+        pytest.param((DeviceMemoryResource, DeviceMemoryResourceOptions), id="DeviceMR"),
+        pytest.param((PinnedMemoryResource, PinnedMemoryResourceOptions), id="PinnedMR"),
+        pytest.param((ManagedMemoryResource, ManagedMemoryResourceOptions), id="ManagedMR"),
     ]
 )
-def memory_resource_factory_with_device(request, init_cuda):
-    """Parametrized fixture providing memory resource types with device usage configuration.
+def memory_resource_factory(request, init_cuda):
+    """Parametrized fixture providing memory resource types.
 
-    Returns a 3-tuple of (MRClass, MROptionClass, use_device_object).
-    For DeviceMemoryResource, use_device_object is True/False indicating whether to pass
-    a Device object or device_id. For PinnedMemoryResource and ManagedMemoryResource,
-    use_device_object is None as they don't require a device parameter.
+    Returns a 2-tuple of (MRClass, MROptionClass).
 
     Usage:
-        def test_something(memory_resource_factory_with_device):
-            MRClass, MROptions, use_device_object = memory_resource_factory_with_device
-            device = Device(0)
+        def test_something(memory_resource_factory):
+            MRClass, MROptions = memory_resource_factory
+            device = Device()
             if MRClass is DeviceMemoryResource:
-                device_arg = device if use_device_object else device.device_id
-                mr = MRClass(device_arg)
+                mr = MRClass(device)
             elif MRClass is PinnedMemoryResource:
                 mr = MRClass()
             elif MRClass is ManagedMemoryResource:
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 66d160eb44..a36ee3905e 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -24,8 +24,10 @@
     DeviceMemoryResourceOptions,
     GraphMemoryResource,
     ManagedMemoryResource,
+    ManagedMemoryResourceOptions,
     MemoryResource,
     PinnedMemoryResource,
+    PinnedMemoryResourceOptions,
     VirtualMemoryResource,
     VirtualMemoryResourceOptions,
 )
@@ -518,50 +520,76 @@ def test_buffer_dlpack_failure_clean_up():
     assert after == before
 
 
-def test_modern_device_memory_resource_initialization(memory_resource_factory_with_device):
+@pytest.mark.parametrize("use_device_object", [True, False])
+def test_device_memory_resource_initialization(use_device_object):
+    """Test that DeviceMemoryResource can be initialized successfully.
+
+    This test verifies that the DeviceMemoryResource initializes properly,
+    including the release threshold configuration for performance optimization.
+    """
     device = Device()
-    MR, MRops, use_device_object = memory_resource_factory_with_device
 
-    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
+    if not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
-    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
+
+    device.set_current()
+
+    # This should succeed and configure the memory pool release threshold.
+    # The resource can be constructed from either a device or device ordinal.
+    device_arg = device if use_device_object else device.device_id
+    mr = DeviceMemoryResource(device_arg)
+
+    # Verify basic properties
+    assert mr.device_id == device.device_id
+    assert mr.is_device_accessible
+    assert not mr.is_host_accessible
+    assert not mr.is_ipc_enabled
+
+    # Test allocation/deallocation works
+    buffer = mr.allocate(1024)
+    assert buffer.size == 1024
+    assert buffer.device_id == device.device_id
+    buffer.close()
+
+
+def test_pinned_memory_resource_initialization(init_cuda):
+    device = Device()
+    if not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
+
+    device.set_current()
+
+    mr = PinnedMemoryResource()
+    assert mr.is_device_accessible
+    assert mr.is_host_accessible
+
+    # Test allocation/deallocation works
+    buffer = mr.allocate(1024)
+    assert buffer.size == 1024
+    assert buffer.device_id == -1  # Not bound to any GPU
+    assert buffer.is_host_accessible
+    assert buffer.memory_resource == mr
+    assert buffer.is_device_accessible
+    buffer.close()
+
+
+def test_managed_memory_resource_initialization(init_cuda):
+    device = Device()
+    if not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
 
     device.set_current()
 
-    if MR is DeviceMemoryResource:
-        # This should succeed and configure the memory pool release threshold.
-        # The resource can be constructed from either a device or device ordinal.
-        device_arg = device if use_device_object else device.device_id
-        mr = MR(device_arg)
-        assert mr.device_id == device.device_id
-        assert mr.is_device_accessible
-        assert not mr.is_host_accessible
-        assert not mr.is_ipc_enabled
-    elif MR is PinnedMemoryResource:
-        mr = PinnedMemoryResource()
-        assert mr.is_device_accessible
-        assert mr.is_host_accessible
-    elif MR is ManagedMemoryResource:
-        mr = ManagedMemoryResource()
-        assert mr.is_device_accessible
-        assert mr.is_host_accessible
-        assert mr.device_id == -1  # Default preferred location is CPU
+    mr = ManagedMemoryResource()
+    assert mr.is_device_accessible
+    assert mr.is_host_accessible
+    assert mr.device_id == -1  # Default preferred location is CPU
 
     # Test allocation/deallocation works
     buffer = mr.allocate(1024)
     assert buffer.size == 1024
-    if MR is DeviceMemoryResource:
-        assert buffer.device_id == device.device_id
-        assert not buffer.is_host_accessible
-    elif MR is PinnedMemoryResource:
-        assert buffer.device_id == -1  # Not bound to any GPU
-        assert buffer.is_host_accessible
-    elif MR is ManagedMemoryResource:
-        assert buffer.device_id == -1  # Managed memory with CPU preference
-        assert buffer.is_host_accessible  # But accessible from host
+    assert buffer.device_id == -1  # Managed memory with CPU preference
+    assert buffer.is_host_accessible  # But accessible from host
     assert buffer.memory_resource == mr
     assert buffer.is_device_accessible
     buffer.close()
@@ -768,40 +796,114 @@ def test_vmm_allocator_rdma_unsupported_exception():
         VirtualMemoryResource(device, config=options)
 
 
-def test_modern_memory_resources(memory_resource_factory_with_device):
+def test_device_memory_resource_with_options(init_cuda):
     device = Device()
+    if not device.properties.memory_pools_supported:
+        pytest.skip("Device does not support mempool operations")
 
-    MR, MRops, _ = memory_resource_factory_with_device
+    device.set_current()
 
-    if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
-        pytest.skip("Device does not support mempool operations")
-    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
+    # Test basic pool creation
+    options = DeviceMemoryResourceOptions(max_size=POOL_SIZE)
+    mr = DeviceMemoryResource(device, options=options)
+    assert mr.device_id == device.device_id
+    assert mr.is_device_accessible
+    assert not mr.is_host_accessible
+    assert not mr.is_ipc_enabled
+
+    # Test allocation and deallocation
+    buffer1 = mr.allocate(1024)
+    assert buffer1.handle != 0
+    assert buffer1.size == 1024
+    assert buffer1.memory_resource == mr
+    buffer1.close()
+
+    # Test multiple allocations
+    buffer1 = mr.allocate(1024)
+    buffer2 = mr.allocate(2048)
+    assert buffer1.handle != buffer2.handle
+    assert buffer1.size == 1024
+    assert buffer2.size == 2048
+    buffer1.close()
+    buffer2.close()
+
+    # Test stream-based allocation
+    stream = device.create_stream()
+    buffer = mr.allocate(1024, stream=stream)
+    assert buffer.handle != 0
+    buffer.close(stream)
+
+    # Test memory copying between buffers from same pool
+    src_buffer = mr.allocate(64)
+    dst_buffer = mr.allocate(64)
+    stream = device.create_stream()
+    src_buffer.copy_to(dst_buffer, stream=stream)
+    device.sync()
+    dst_buffer.close()
+    src_buffer.close()
+
+
+def test_pinned_memory_resource_with_options(init_cuda):
+    device = Device()
+    if not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
+
+    device.set_current()
+
+    # Test basic pool creation
+    options = PinnedMemoryResourceOptions(max_size=POOL_SIZE)
+    mr = PinnedMemoryResource(options)
+    assert mr.device_id == -1  # Not bound to any GPU
+    assert mr.is_device_accessible
+    assert mr.is_host_accessible
+    assert not mr.is_ipc_enabled
+
+    # Test allocation and deallocation
+    buffer1 = mr.allocate(1024)
+    assert buffer1.handle != 0
+    assert buffer1.size == 1024
+    assert buffer1.memory_resource == mr
+    buffer1.close()
+
+    # Test multiple allocations
+    buffer1 = mr.allocate(1024)
+    buffer2 = mr.allocate(2048)
+    assert buffer1.handle != buffer2.handle
+    assert buffer1.size == 1024
+    assert buffer2.size == 2048
+    buffer1.close()
+    buffer2.close()
+
+    # Test stream-based allocation
+    stream = device.create_stream()
+    buffer = mr.allocate(1024, stream=stream)
+    assert buffer.handle != 0
+    buffer.close(stream)
+
+    # Test memory copying between buffers from same pool
+    src_buffer = mr.allocate(64)
+    dst_buffer = mr.allocate(64)
+    stream = device.create_stream()
+    src_buffer.copy_to(dst_buffer, stream=stream)
+    device.sync()
+    dst_buffer.close()
+    src_buffer.close()
+
+
+def test_managed_memory_resource_with_options(init_cuda):
+    device = Device()
+    if not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
 
     device.set_current()
 
     # Test basic pool creation
-    options = MRops(max_size=POOL_SIZE)
-    if MR is DeviceMemoryResource:
-        mr = MR(device, options=options)
-        assert mr.device_id == device.device_id
-        assert mr.is_device_accessible
-        assert not mr.is_host_accessible
-        assert not mr.is_ipc_enabled
-    elif MR is PinnedMemoryResource:
-        mr = MR(options)
-        assert mr.device_id == -1  # Not bound to any GPU
-        assert mr.is_device_accessible
-        assert mr.is_host_accessible
-        assert not mr.is_ipc_enabled
-    elif MR is ManagedMemoryResource:
-        mr = MR(options)
-        assert mr.device_id == -1  # Default preferred location is CPU
-        assert mr.is_device_accessible
-        assert mr.is_host_accessible
-        assert not mr.is_ipc_enabled
+    options = ManagedMemoryResourceOptions()
+    mr = ManagedMemoryResource(options)
+    assert mr.device_id == -1  # Default preferred location is CPU
+    assert mr.is_device_accessible
+    assert mr.is_host_accessible
+    assert not mr.is_ipc_enabled
 
     # Test allocation and deallocation
     buffer1 = mr.allocate(1024)
@@ -856,6 +958,78 @@ def test_mempool_ipc_errors(mempool_device):
     buffer.close()
 
 
+def test_pinned_mempool_ipc_basic():
+    """Test basic IPC functionality for PinnedMemoryResource."""
+    device = Device()
+    device.set_current()
+
+    if not device.properties.host_memory_pools_supported:
+        pytest.skip("Device does not support host mempool operations")
+
+    if platform.system() == "Windows":
+        pytest.skip("IPC not implemented for Windows")
+
+    if not supports_ipc_mempool(device):
+        pytest.skip("Driver rejects IPC-enabled mempool creation on this platform")
+
+    # Test IPC-enabled PinnedMemoryResource creation
+    options = PinnedMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
+    mr = PinnedMemoryResource(options)
+    assert mr.is_ipc_enabled
+    assert mr.is_device_accessible
+    assert mr.is_host_accessible
+    assert mr.device_id == 0  # IPC-enabled uses location id 0
+
+    # Test allocation handle export
+    alloc_handle = mr.get_allocation_handle()
+    assert alloc_handle is not None
+
+    # Test buffer allocation
+    buffer = mr.allocate(1024)
+    assert buffer.size == 1024
+    assert buffer.is_device_accessible
+    assert buffer.is_host_accessible
+
+    # Test IPC descriptor
+    ipc_desc = buffer.get_ipc_descriptor()
+    assert ipc_desc is not None
+    assert ipc_desc.size == 1024
+
+    buffer.close()
+    mr.close()
+
+
+def test_pinned_mempool_ipc_errors():
+    """Test error cases when IPC operations are disabled for PinnedMemoryResource."""
+    device = Device()
+    device.set_current()
+
+    if not device.properties.host_memory_pools_supported:
+        pytest.skip("Device does not support host mempool operations")
+
+    # Test with IPC disabled (default)
+    options = PinnedMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False)
+    mr = PinnedMemoryResource(options)
+    assert not mr.is_ipc_enabled
+    assert mr.device_id == -1  # Non-IPC uses location id -1
+
+    buffer = mr.allocate(64)
+    ipc_error_msg = "Memory resource is not IPC-enabled"
+
+    with pytest.raises(RuntimeError, match=ipc_error_msg):
+        mr.get_allocation_handle()
+
+    with pytest.raises(RuntimeError, match=ipc_error_msg):
+        buffer.get_ipc_descriptor()
+
+    with pytest.raises(RuntimeError, match=ipc_error_msg):
+        handle = IPCBufferDescriptor._init(b"", 0)
+        Buffer.from_ipc_descriptor(mr, handle)
+
+    buffer.close()
+    mr.close()
+
+
 @pytest.mark.parametrize("ipc_enabled", [True, False])
 @pytest.mark.parametrize(
     "property_name,expected_type",
@@ -870,9 +1044,9 @@ def test_mempool_ipc_errors(mempool_device):
         ("used_mem_high", int),
     ],
 )
-def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, property_name, expected_type):
+def test_mempool_attributes(ipc_enabled, memory_resource_factory, property_name, expected_type):
     """Test all properties of memory pool attributes for all memory resource types."""
-    MR, MRops, _ = memory_resource_factory_with_device
+    MR, MRops = memory_resource_factory
     device = Device()
 
     if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
@@ -882,8 +1056,8 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, pr
     elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
 
-    # PinnedMemoryResource and ManagedMemoryResource do not support IPC
-    if (MR is PinnedMemoryResource or MR is ManagedMemoryResource) and ipc_enabled:
+    # ManagedMemoryResource does not support IPC
+    if MR is ManagedMemoryResource and ipc_enabled:
         pytest.skip(f"{MR.__name__} does not support IPC")
 
     device.set_current()
@@ -898,8 +1072,12 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, pr
         options = MRops(max_size=POOL_SIZE, ipc_enabled=ipc_enabled)
         mr = MR(device, options=options)
         assert mr.is_ipc_enabled == ipc_enabled
-    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
-        options = MRops(max_size=POOL_SIZE)
+    elif MR is PinnedMemoryResource:
+        options = MRops(max_size=POOL_SIZE, ipc_enabled=ipc_enabled)
+        mr = MR(options)
+        assert mr.is_ipc_enabled == ipc_enabled
+    elif MR is ManagedMemoryResource:
+        options = MRops()
         mr = MR(options)
         assert not mr.is_ipc_enabled
 
@@ -934,9 +1112,9 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory_with_device, pr
         assert value >= current_value, f"{property_name} should be >= {current_prop}"
 
 
-def test_mempool_attributes_repr(memory_resource_factory_with_device):
+def test_mempool_attributes_repr(memory_resource_factory):
     """Test the repr of memory pool attributes for all memory resource types."""
-    MR, MRops, _ = memory_resource_factory_with_device
+    MR, MRops = memory_resource_factory
     device = Device()
 
     if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
@@ -950,8 +1128,10 @@ def test_mempool_attributes_repr(memory_resource_factory_with_device):
 
     if MR is DeviceMemoryResource:
         mr = MR(device, options={"max_size": 2048})
-    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
+    elif MR is PinnedMemoryResource:
         mr = MR(options={"max_size": 2048})
+    elif MR is ManagedMemoryResource:
+        mr = MR(options={})
 
     buffer1 = mr.allocate(64)
     buffer2 = mr.allocate(64)
@@ -965,9 +1145,9 @@ def test_mempool_attributes_repr(memory_resource_factory_with_device):
     buffer2.close()
 
 
-def test_mempool_attributes_ownership(memory_resource_factory_with_device):
+def test_mempool_attributes_ownership(memory_resource_factory):
     """Ensure the attributes bundle handles references correctly for all memory resource types."""
-    MR, MRops, _ = memory_resource_factory_with_device
+    MR, MRops = memory_resource_factory
     device = Device()
 
     if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
@@ -985,8 +1165,10 @@ def test_mempool_attributes_ownership(memory_resource_factory_with_device):
 
     if MR is DeviceMemoryResource:
         mr = MR(device, dict(max_size=POOL_SIZE))
-    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
+    elif MR is PinnedMemoryResource:
         mr = MR(dict(max_size=POOL_SIZE))
+    elif MR is ManagedMemoryResource:
+        mr = MR(dict())
 
     attributes = mr.attributes
     mr.close()
@@ -1000,8 +1182,10 @@ def test_mempool_attributes_ownership(memory_resource_factory_with_device):
     # mempool handle was really reused).
     if MR is DeviceMemoryResource:
         mr = MR(device, dict(max_size=POOL_SIZE))  # noqa: F841
-    elif MR is PinnedMemoryResource or MR is ManagedMemoryResource:
+    elif MR is PinnedMemoryResource:
         mr = MR(dict(max_size=POOL_SIZE))  # noqa: F841
+    elif MR is ManagedMemoryResource:
+        mr = MR(dict())  # noqa: F841
 
     with pytest.raises(RuntimeError, match="is expired"):
         _ = attributes.used_mem_high

From 71c8b6320d38fee0959f48578774149c7f8a6df2 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 04:30:42 +0000
Subject: [PATCH 05/14] make numa detection slightly better

---
 .../_memory/_pinned_memory_resource.pyx       | 56 ++++++++++++-------
 1 file changed, 36 insertions(+), 20 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
index 4a18a0a43c..20f2d1b1ad 100644
--- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
@@ -28,29 +28,45 @@ def _check_numa_nodes():
     if platform.system() != "Linux":
         return
 
+    numa_count = None
+
+    # Try /sys filesystem first (most reliable and doesn't require external tools)
     try:
-        result = subprocess.run(
-            ["lscpu"],
-            capture_output=True,
-            text=True,
-            timeout=1
-        )
-        for line in result.stdout.splitlines():
-            if line.startswith("NUMA node(s):"):
-                numa_count = int(line.split(":")[1].strip())
-                if numa_count > 1:
-                    warnings.warn(
-                        f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory "
-                        f"uses location ID 0, which may not work correctly with multiple "
-                        f"NUMA nodes.",
-                        UserWarning,
-                        stacklevel=3
-                    )
-                break
-    except (subprocess.SubprocessError, ValueError, FileNotFoundError):
-        # If we can't check, don't warn
+        import os
+        node_path = "/sys/devices/system/node"
+        if os.path.exists(node_path):
+            # Count directories named "node[0-9]+"
+            nodes = [d for d in os.listdir(node_path) if d.startswith("node") and d[4:].isdigit()]
+            numa_count = len(nodes)
+    except (OSError, PermissionError):
         pass
 
+    # Fallback to lscpu if /sys check didn't work
+    if numa_count is None:
+        try:
+            result = subprocess.run(
+                ["lscpu"],
+                capture_output=True,
+                text=True,
+                timeout=1
+            )
+            for line in result.stdout.splitlines():
+                if line.startswith("NUMA node(s):"):
+                    numa_count = int(line.split(":")[1].strip())
+                    break
+        except (subprocess.SubprocessError, ValueError, FileNotFoundError):
+            pass
+
+    # Warn if multiple NUMA nodes detected
+    if numa_count is not None and numa_count > 1:
+        warnings.warn(
+            f"System has {numa_count} NUMA nodes. IPC-enabled pinned memory "
+            f"uses location ID 0, which may not work correctly with multiple "
+            f"NUMA nodes.",
+            UserWarning,
+            stacklevel=3
+        )
+
 
 __all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions']
 

From 05b1e61e4759b2bd7532c68e7115dad6c6aed796 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 06:16:02 +0000
Subject: [PATCH 06/14] enable PMR IPC tests

---
 .../_memory/_pinned_memory_resource.pyx       |  7 ++++++
 cuda_core/tests/conftest.py                   | 23 ++++++++++++++----
 cuda_core/tests/memory_ipc/test_event_ipc.py  |  9 +++++++
 cuda_core/tests/memory_ipc/test_serialize.py  | 24 ++++++++++++++++++-
 4 files changed, 58 insertions(+), 5 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
index 20f2d1b1ad..9c2a0b6834 100644
--- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
@@ -197,6 +197,13 @@ cdef class PinnedMemoryResource(_MemPool):
         -------
             A new host-pinned memory resource instance with the imported handle.
         """
+        # TODO: Investigate if we need to initialize CUDA here. Currently required
+        # to avoid CUDA_ERROR_NOT_INITIALIZED in cuMemPoolImportFromShareableHandle.
+        # DMR doesn't explicitly do this, but it requires device_id parameter which
+        # may implicitly initialize CUDA. Need to find a cleaner solution.
+        from .._device import Device
+        Device(0).set_current()
+
         cdef PinnedMemoryResource mr = <PinnedMemoryResource>(
             _ipc.MP_from_allocation_handle(cls, alloc_handle))
         return mr
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index ce57ef237a..dca2d9c58a 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -111,11 +111,26 @@ def ipc_device():
     return device
 
 
-@pytest.fixture
-def ipc_memory_resource(ipc_device):
+@pytest.fixture(
+    params=[
+        pytest.param("device", id="DeviceMR"),
+        pytest.param("pinned", id="PinnedMR"),
+    ]
+)
+def ipc_memory_resource(request, ipc_device):
+    """Provides IPC-enabled memory resource (either Device or Pinned)."""
     POOL_SIZE = 2097152
-    options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
-    mr = DeviceMemoryResource(ipc_device, options=options)
+    mr_type = request.param
+
+    if mr_type == "device":
+        options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
+        mr = DeviceMemoryResource(ipc_device, options=options)
+    else:  # pinned
+        if not ipc_device.properties.host_memory_pools_supported:
+            pytest.skip("Device does not support host mempool operations")
+        options = PinnedMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
+        mr = PinnedMemoryResource(options=options)
+
     assert mr.is_ipc_enabled
     return mr
 
diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
index ce756cba21..e69f8592fe 100644
--- a/cuda_core/tests/memory_ipc/test_event_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -18,6 +18,15 @@ class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
 
     def test_main(self, ipc_device, ipc_memory_resource):
+        # TODO: This test currently fails with PinnedMemoryResource due to timeout
+        # in child process. The failure is likely unrelated to PMR itself since Event
+        # IPC is independent of memory resource type. Need to investigate the root cause.
+        # For now, skip PMR to avoid redundant testing since this is an Event IPC test.
+        from cuda.core.experimental import PinnedMemoryResource
+
+        if isinstance(ipc_memory_resource, PinnedMemoryResource):
+            pytest.skip("Event IPC test temporarily skipped for PinnedMemoryResource (TODO: investigate)")
+
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         device = ipc_device
         mr = ipc_memory_resource
diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
index 7fe65b2b4a..74623eecf2 100644
--- a/cuda_core/tests/memory_ipc/test_serialize.py
+++ b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -5,6 +5,7 @@
 import multiprocessing.reduction
 import os
 
+import pytest
 from cuda.core.experimental import Buffer, Device, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
@@ -132,6 +133,18 @@ class TestObjectPassing:
     """
 
     def test_main(self, ipc_device, ipc_memory_resource):
+        # TODO: This test fails with PinnedMR due to CUDA_ERROR_ALREADY_MAPPED.
+        # When buffer1 is passed as an argument, it's serialized and mapped into
+        # the child process. Then trying to recreate it from descriptor causes
+        # "already mapped" error. This might be a test design issue or a real
+        # difference in how PMR vs DMR handle double-mapping. Needs investigation.
+        from cuda.core.experimental import PinnedMemoryResource
+
+        if isinstance(ipc_memory_resource, PinnedMemoryResource):
+            pytest.skip(
+                "TestObjectPassing temporarily skipped for PinnedMR (TODO: investigate CUDA_ERROR_ALREADY_MAPPED)"
+            )
+
         # Define the objects.
         device = ipc_device
         mr = ipc_memory_resource
@@ -154,7 +167,16 @@ def test_main(self, ipc_device, ipc_memory_resource):
     def child_main(self, alloc_handle, mr1, buffer_desc, buffer1):
         device = Device()
         device.set_current()
-        mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle)
+
+        # Recreate MR from allocation handle using the same type as mr1
+        # For DMR, we need to pass device; for PMR, we don't
+        from cuda.core.experimental import DeviceMemoryResource
+
+        if type(mr1) is DeviceMemoryResource:
+            mr2 = type(mr1).from_allocation_handle(device, alloc_handle)
+        else:
+            mr2 = type(mr1).from_allocation_handle(alloc_handle)
+
         pgen = PatternGen(device, NBYTES)
 
         # OK to build the buffer from either mr and the descriptor.

From 94cc940eb4b785c5b343adc4a61a29369718f555 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 06:32:06 +0000
Subject: [PATCH 07/14] fix triple free in the child process in
 TestObjectPassing

---
 cuda_core/tests/memory_ipc/test_serialize.py | 63 ++++----------------
 1 file changed, 10 insertions(+), 53 deletions(-)

diff --git a/cuda_core/tests/memory_ipc/test_serialize.py b/cuda_core/tests/memory_ipc/test_serialize.py
index 74623eecf2..f5686db28c 100644
--- a/cuda_core/tests/memory_ipc/test_serialize.py
+++ b/cuda_core/tests/memory_ipc/test_serialize.py
@@ -5,7 +5,6 @@
 import multiprocessing.reduction
 import os
 
-import pytest
 from cuda.core.experimental import Buffer, Device, DeviceMemoryResource
 from helpers.buffers import PatternGen
 
@@ -133,18 +132,6 @@ class TestObjectPassing:
     """
 
     def test_main(self, ipc_device, ipc_memory_resource):
-        # TODO: This test fails with PinnedMR due to CUDA_ERROR_ALREADY_MAPPED.
-        # When buffer1 is passed as an argument, it's serialized and mapped into
-        # the child process. Then trying to recreate it from descriptor causes
-        # "already mapped" error. This might be a test design issue or a real
-        # difference in how PMR vs DMR handle double-mapping. Needs investigation.
-        from cuda.core.experimental import PinnedMemoryResource
-
-        if isinstance(ipc_memory_resource, PinnedMemoryResource):
-            pytest.skip(
-                "TestObjectPassing temporarily skipped for PinnedMR (TODO: investigate CUDA_ERROR_ALREADY_MAPPED)"
-            )
-
         # Define the objects.
         device = ipc_device
         mr = ipc_memory_resource
@@ -164,50 +151,20 @@ def test_main(self, ipc_device, ipc_memory_resource):
         pgen.verify_buffer(buffer, seed=True)
         buffer.close()
 
-    def child_main(self, alloc_handle, mr1, buffer_desc, buffer1):
+    def child_main(self, alloc_handle, mr1, buffer_desc, buffer):
         device = Device()
         device.set_current()
-
-        # Recreate MR from allocation handle using the same type as mr1
-        # For DMR, we need to pass device; for PMR, we don't
-        from cuda.core.experimental import DeviceMemoryResource
-
-        if type(mr1) is DeviceMemoryResource:
-            mr2 = type(mr1).from_allocation_handle(device, alloc_handle)
-        else:
-            mr2 = type(mr1).from_allocation_handle(alloc_handle)
-
+        mr2 = DeviceMemoryResource.from_allocation_handle(device, alloc_handle)  # noqa: F841
         pgen = PatternGen(device, NBYTES)
 
-        # OK to build the buffer from either mr and the descriptor.
-        # All buffer* objects point to the same memory.
-        buffer2 = Buffer.from_ipc_descriptor(mr1, buffer_desc)
-        buffer3 = Buffer.from_ipc_descriptor(mr2, buffer_desc)
-
-        pgen.verify_buffer(buffer1, seed=False)
-        pgen.verify_buffer(buffer2, seed=False)
-        pgen.verify_buffer(buffer3, seed=False)
-
-        # Modify 1.
-        pgen.fill_buffer(buffer1, seed=True)
-
-        pgen.verify_buffer(buffer1, seed=True)
-        pgen.verify_buffer(buffer2, seed=True)
-        pgen.verify_buffer(buffer3, seed=True)
-
-        # Modify 2.
-        pgen.fill_buffer(buffer2, seed=False)
-
-        pgen.verify_buffer(buffer1, seed=False)
-        pgen.verify_buffer(buffer2, seed=False)
-        pgen.verify_buffer(buffer3, seed=False)
+        # Verify initial content
+        pgen.verify_buffer(buffer, seed=False)
 
-        # Modify 3.
-        pgen.fill_buffer(buffer3, seed=True)
+        # Modify the buffer
+        pgen.fill_buffer(buffer, seed=True)
 
-        pgen.verify_buffer(buffer1, seed=True)
-        pgen.verify_buffer(buffer2, seed=True)
-        pgen.verify_buffer(buffer3, seed=True)
+        # Verify modified content
+        pgen.verify_buffer(buffer, seed=True)
 
-        # Close any one buffer.
-        buffer1.close()
+        # Clean up - only ONE free
+        buffer.close()

From 64aa9514521f776fe565a537f9d4f1eff3830193 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 15:39:20 +0000
Subject: [PATCH 08/14] fix IPC event test for PMR

---
 cuda_core/tests/memory_ipc/test_event_ipc.py | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/cuda_core/tests/memory_ipc/test_event_ipc.py b/cuda_core/tests/memory_ipc/test_event_ipc.py
index e69f8592fe..5edf97f2ae 100644
--- a/cuda_core/tests/memory_ipc/test_event_ipc.py
+++ b/cuda_core/tests/memory_ipc/test_event_ipc.py
@@ -18,20 +18,16 @@ class TestEventIpc:
     """Check the basic usage of IPC-enabled events with a latch kernel."""
 
     def test_main(self, ipc_device, ipc_memory_resource):
-        # TODO: This test currently fails with PinnedMemoryResource due to timeout
-        # in child process. The failure is likely unrelated to PMR itself since Event
-        # IPC is independent of memory resource type. Need to investigate the root cause.
-        # For now, skip PMR to avoid redundant testing since this is an Event IPC test.
-        from cuda.core.experimental import PinnedMemoryResource
-
-        if isinstance(ipc_memory_resource, PinnedMemoryResource):
-            pytest.skip("Event IPC test temporarily skipped for PinnedMemoryResource (TODO: investigate)")
-
         log = TimestampedLogger(prefix="parent: ", enabled=ENABLE_LOGGING)
         device = ipc_device
         mr = ipc_memory_resource
         stream1 = device.create_stream()
-        latch = LatchKernel(device)
+        # TODO: We pick a timeout here to ensure forward progress (it needs to be
+        # less than CHILD_TIMEOUT_SEC) when a pinned memory resource is in use,
+        # in which case the call to buffer.copy_from(...) below is a synchronous
+        # operation that blocks the host. But calling the latch kernel here does not
+        # make any sense. We should refactor this test.
+        latch = LatchKernel(device, timeout_sec=5)
 
         # Start the child process.
         q_out, q_in = [mp.Queue() for _ in range(2)]

From f3d9f8220bb2c8fc072c9f5e2b7c7ecda7da0396 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 15:53:42 +0000
Subject: [PATCH 09/14] don't sweat on the numa node check

---
 .../experimental/_memory/_pinned_memory_resource.pyx   | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
index 9c2a0b6834..471813c406 100644
--- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
@@ -23,8 +23,16 @@ import warnings
 from cuda.core.experimental._utils.cuda_utils import check_multiprocessing_start_method
 
 
+# Cache to ensure NUMA warning is only raised once per process
+cdef bint _numa_warning_shown = False
+
+
 def _check_numa_nodes():
     """Check if system has multiple NUMA nodes and warn if so."""
+    global _numa_warning_shown
+    if _numa_warning_shown:
+        return
+
     if platform.system() != "Linux":
         return
 
@@ -67,6 +75,8 @@ def _check_numa_nodes():
             stacklevel=3
         )
 
+    _numa_warning_shown = True
+
 
 __all__ = ['PinnedMemoryResource', 'PinnedMemoryResourceOptions']
 

From f3ed1084b2f985483ef8e570cc294ea361df9bd3 Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 20:11:31 +0000
Subject: [PATCH 10/14] avoid awkward Device.set_current in PMR

---
 .../_memory/_pinned_memory_resource.pyx             | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
index 471813c406..f5395308e5 100644
--- a/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_pinned_memory_resource.pyx
@@ -10,6 +10,7 @@ from cuda.core.experimental._memory cimport _ipc
 from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
+    HANDLE_RETURN,
 )
 
 from dataclasses import dataclass
@@ -207,12 +208,12 @@ cdef class PinnedMemoryResource(_MemPool):
         -------
             A new host-pinned memory resource instance with the imported handle.
         """
-        # TODO: Investigate if we need to initialize CUDA here. Currently required
-        # to avoid CUDA_ERROR_NOT_INITIALIZED in cuMemPoolImportFromShareableHandle.
-        # DMR doesn't explicitly do this, but it requires device_id parameter which
-        # may implicitly initialize CUDA. Need to find a cleaner solution.
-        from .._device import Device
-        Device(0).set_current()
+        # cuMemPoolImportFromShareableHandle requires CUDA to be initialized, but in
+        # a child process CUDA may not be initialized yet. For DeviceMemoryResource,
+        # this is not a concern because most likely when retrieving the device_id the
+        # user would have already initialized CUDA. But since PinnedMemoryResource is
+        # not device-specific it is unlikelt the case.
+        HANDLE_RETURN(cydriver.cuInit(0))
 
         cdef PinnedMemoryResource mr = <PinnedMemoryResource>(
             _ipc.MP_from_allocation_handle(cls, alloc_handle))

From 4c9e80b604a6e615cccaca302a983c7ca3b63eac Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 20:46:00 +0000
Subject: [PATCH 11/14] MMR fixes

---
 .../_memory/_managed_memory_resource.pyx      | 104 +++---------------
 cuda_core/tests/test_memory.py                |  23 ++--
 2 files changed, 24 insertions(+), 103 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
index b3f98f59bb..bb3e7ddfff 100644
--- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
@@ -25,12 +25,13 @@ cdef class ManagedMemoryResourceOptions:
 
     Attributes
     ----------
-    preferred_location : int, optional
+    preferred_location : int | None, optional
         The preferred device location for the managed memory.
-        Use a device ID (0, 1, 2, ...) for device preference, or -1 for CPU/host.
-        (Default to -1 for CPU/host)
+        Use a device ID (0, 1, 2, ...) for device preference, -1 for CPU/host,
+        or None to let the driver decide.
+        (Default to None)
     """
-    preferred_location : int = -1
+    preferred_location : Optional[int] = None
 
 
 cdef class ManagedMemoryResource(_MemPool):
@@ -69,15 +70,21 @@ cdef class ManagedMemoryResource(_MemPool):
         )
         cdef _MemPoolOptions opts_base = _MemPoolOptions()
 
-        cdef int device_id = -1  # Default: CPU/host preference
+        cdef int device_id = -1
+        cdef object preferred_location = None
         if opts:
-            device_id = opts.preferred_location
+            preferred_location = opts.preferred_location
+            if preferred_location is not None:
+                device_id = preferred_location
             opts_base._use_current = False
 
         opts_base._ipc_enabled = False  # IPC not supported for managed memory pools
 
         # Set location based on preferred_location
-        if device_id == -1:
+        if preferred_location is None:
+            # Let the driver decide
+            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE
+        elif device_id == -1:
             # CPU/host preference
             opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
         else:
@@ -88,73 +95,6 @@ cdef class ManagedMemoryResource(_MemPool):
 
         super().__init__(device_id, opts_base)
 
-    def __reduce__(self):
-        return ManagedMemoryResource.from_registry, (self.uuid,)
-
-    @staticmethod
-    def from_registry(uuid: uuid.UUID) -> ManagedMemoryResource:  # no-cython-lint
-        """
-        Obtain a registered mapped memory resource.
-
-        Raises
-        ------
-        RuntimeError
-            If no mapped memory resource is found in the registry.
-        """
-        return <ManagedMemoryResource>(_ipc.MP_from_registry(uuid))
-
-    def register(self, uuid: uuid.UUID) -> ManagedMemoryResource:  # no-cython-lint
-        """
-        Register a mapped memory resource.
-
-        Returns
-        -------
-        The registered mapped memory resource. If one was previously registered
-        with the given key, it is returned.
-        """
-        return <ManagedMemoryResource>(_ipc.MP_register(self, uuid))
-
-    @classmethod
-    def from_allocation_handle(
-        cls, alloc_handle: int | IPCAllocationHandle
-    ) -> ManagedMemoryResource:
-        """Create a managed memory resource from an allocation handle.
-
-        Construct a new `ManagedMemoryResource` instance that imports a memory
-        pool from a shareable handle. The memory pool is marked as owned.
-
-        Parameters
-        ----------
-        alloc_handle : int | IPCAllocationHandle
-            The shareable handle of the managed memory resource to import. If an
-            integer is supplied, it must represent a valid platform-specific
-            handle. It is the caller's responsibility to close that handle.
-
-        Returns
-        -------
-            A new managed memory resource instance with the imported handle.
-        """
-        cdef ManagedMemoryResource mr = <ManagedMemoryResource>(
-            _ipc.MP_from_allocation_handle(cls, alloc_handle))
-        return mr
-
-    def get_allocation_handle(self) -> IPCAllocationHandle:
-        """Export the memory pool handle to be shared (requires IPC).
-
-        The handle can be used to share the memory pool with other processes.
-        The handle is cached in this `MemoryResource` and owned by it.
-
-        Returns
-        -------
-            The shareable handle for the memory pool.
-
-        Raises
-        ------
-        RuntimeError
-            IPC is not currently supported for managed memory pools.
-        """
-        raise RuntimeError("IPC is not currently supported for managed memory pools")
-
     @property
     def is_device_accessible(self) -> bool:
         """Return True. This memory resource provides device-accessible buffers."""
@@ -177,19 +117,3 @@ cdef class ManagedMemoryResource(_MemPool):
         another process.  If True, allocation is not permitted.
         """
         return self._ipc_data is not None and self._ipc_data._is_mapped
-
-    @property
-    def uuid(self) -> Optional[uuid.UUID]:
-        """
-        A universally unique identifier for this memory resource. Meaningful
-        only for IPC-enabled memory resources.
-        """
-        return getattr(self._ipc_data, 'uuid', None)
-
-
-def _deep_reduce_managed_memory_resource(mr):
-    raise RuntimeError("IPC is not currently supported for managed memory pools")
-
-
-# Multiprocessing support disabled until IPC is supported for managed memory pools
-# multiprocessing.reduction.register(ManagedMemoryResource, _deep_reduce_managed_memory_resource)
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index a36ee3905e..1997e962e7 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -575,20 +575,18 @@ def test_pinned_memory_resource_initialization(init_cuda):
 
 def test_managed_memory_resource_initialization(init_cuda):
     device = Device()
-    if not device.properties.memory_pools_supported:
-        pytest.skip("Device does not support mempool operations")
+    if not device.properties.memory_pools_supported or not device.properties.managed_memory:
+        pytest.skip("Device does not support managed memory pool operations")
 
     device.set_current()
 
     mr = ManagedMemoryResource()
     assert mr.is_device_accessible
     assert mr.is_host_accessible
-    assert mr.device_id == -1  # Default preferred location is CPU
 
     # Test allocation/deallocation works
     buffer = mr.allocate(1024)
     assert buffer.size == 1024
-    assert buffer.device_id == -1  # Managed memory with CPU preference
     assert buffer.is_host_accessible  # But accessible from host
     assert buffer.memory_resource == mr
     assert buffer.is_device_accessible
@@ -892,15 +890,14 @@ def test_pinned_memory_resource_with_options(init_cuda):
 
 def test_managed_memory_resource_with_options(init_cuda):
     device = Device()
-    if not device.properties.memory_pools_supported:
-        pytest.skip("Device does not support mempool operations")
+    if not device.properties.memory_pools_supported or not device.properties.managed_memory:
+        pytest.skip("Device does not support managed memory pool operations")
 
     device.set_current()
 
     # Test basic pool creation
     options = ManagedMemoryResourceOptions()
     mr = ManagedMemoryResource(options)
-    assert mr.device_id == -1  # Default preferred location is CPU
     assert mr.is_device_accessible
     assert mr.is_host_accessible
     assert not mr.is_ipc_enabled
@@ -1053,8 +1050,8 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory, property_name,
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
-        pytest.skip("Device does not support mempool operations")
+    elif MR is ManagedMemoryResource and (not device.properties.memory_pools_supported or not device.properties.managed_memory):
+        pytest.skip("Device does not support managed memory pool operations")
 
     # ManagedMemoryResource does not support IPC
     if MR is ManagedMemoryResource and ipc_enabled:
@@ -1121,8 +1118,8 @@ def test_mempool_attributes_repr(memory_resource_factory):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
-        pytest.skip("Device does not support mempool operations")
+    elif MR is ManagedMemoryResource and (not device.properties.memory_pools_supported or not device.properties.managed_memory):
+        pytest.skip("Device does not support managed memory pool operations")
 
     device.set_current()
 
@@ -1154,8 +1151,8 @@ def test_mempool_attributes_ownership(memory_resource_factory):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and not device.properties.memory_pools_supported:
-        pytest.skip("Device does not support mempool operations")
+    elif MR is ManagedMemoryResource and (not device.properties.memory_pools_supported or not device.properties.managed_memory):
+        pytest.skip("Device does not support managed memory pool operations")
 
     # Skip if IPC mempool is not supported on this platform/device (only relevant for DeviceMemoryResource)
     if MR is DeviceMemoryResource and not supports_ipc_mempool(device):

From e2b9ea510e8956399f3ad1c09cadecd02d87fcbc Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 20:51:06 +0000
Subject: [PATCH 12/14] add docs

---
 .../_memory/_managed_memory_resource.pyx             |  3 ---
 cuda_core/docs/source/api.rst                        |  4 ++++
 cuda_core/docs/source/release/0.5.x-notes.rst        |  5 ++++-
 cuda_core/tests/test_memory.py                       | 12 +++++++++---
 4 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
index bb3e7ddfff..8f2b936be2 100644
--- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
@@ -6,15 +6,12 @@ from __future__ import annotations
 
 from cuda.bindings cimport cydriver
 from cuda.core.experimental._memory._memory_pool cimport _MemPool, _MemPoolOptions
-from cuda.core.experimental._memory cimport _ipc
-from cuda.core.experimental._memory._ipc cimport IPCAllocationHandle
 from cuda.core.experimental._utils.cuda_utils cimport (
     check_or_create_options,
 )
 
 from dataclasses import dataclass
 from typing import Optional
-import uuid
 
 __all__ = ['ManagedMemoryResource', 'ManagedMemoryResourceOptions']
 
diff --git a/cuda_core/docs/source/api.rst b/cuda_core/docs/source/api.rst
index 45be638eb6..1feeba5b12 100644
--- a/cuda_core/docs/source/api.rst
+++ b/cuda_core/docs/source/api.rst
@@ -26,12 +26,16 @@ CUDA runtime
    Event
    MemoryResource
    DeviceMemoryResource
+   PinnedMemoryResource
+   ManagedMemoryResource
    LegacyPinnedMemoryResource
    VirtualMemoryResource
 
    :template: dataclass.rst
 
    DeviceMemoryResourceOptions
+   PinnedMemoryResourceOptions
+   ManagedMemoryResourceOptions
    EventOptions
    GraphCompleteOptions
    GraphDebugPrintOptions
diff --git a/cuda_core/docs/source/release/0.5.x-notes.rst b/cuda_core/docs/source/release/0.5.x-notes.rst
index 4626a770c1..5b1378963a 100644
--- a/cuda_core/docs/source/release/0.5.x-notes.rst
+++ b/cuda_core/docs/source/release/0.5.x-notes.rst
@@ -21,7 +21,10 @@ None.
 New features
 ------------
 
-None.
+- Added :class:`PinnedMemoryResource` and :class:`PinnedMemoryResourceOptions` for managing
+  host-pinned memory pools with optional IPC support.
+- Added :class:`ManagedMemoryResource` and :class:`ManagedMemoryResourceOptions` for managing
+  unified memory pools accessible from both host and device.
 
 
 New examples
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 1997e962e7..505161339f 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -1050,7 +1050,9 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory, property_name,
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and (not device.properties.memory_pools_supported or not device.properties.managed_memory):
+    elif MR is ManagedMemoryResource and (
+        not device.properties.memory_pools_supported or not device.properties.managed_memory
+    ):
         pytest.skip("Device does not support managed memory pool operations")
 
     # ManagedMemoryResource does not support IPC
@@ -1118,7 +1120,9 @@ def test_mempool_attributes_repr(memory_resource_factory):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and (not device.properties.memory_pools_supported or not device.properties.managed_memory):
+    elif MR is ManagedMemoryResource and (
+        not device.properties.memory_pools_supported or not device.properties.managed_memory
+    ):
         pytest.skip("Device does not support managed memory pool operations")
 
     device.set_current()
@@ -1151,7 +1155,9 @@ def test_mempool_attributes_ownership(memory_resource_factory):
         pytest.skip("Device does not support mempool operations")
     elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
         pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and (not device.properties.memory_pools_supported or not device.properties.managed_memory):
+    elif MR is ManagedMemoryResource and (
+        not device.properties.memory_pools_supported or not device.properties.managed_memory
+    ):
         pytest.skip("Device does not support managed memory pool operations")
 
     # Skip if IPC mempool is not supported on this platform/device (only relevant for DeviceMemoryResource)

From 0c22b3139205449e2723b6927412722ba86b26ec Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Fri, 12 Dec 2025 23:36:09 +0000
Subject: [PATCH 13/14] fix for CUDA 12

---
 .../_memory/_managed_memory_resource.pyx      | 27 ++++----
 .../experimental/_memory/_memory_pool.pyx     | 22 ++++---
 cuda_core/tests/conftest.py                   | 60 ++++++++++++++++-
 cuda_core/tests/test_memory.py                | 65 +++++++++----------
 4 files changed, 115 insertions(+), 59 deletions(-)

diff --git a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
index 8f2b936be2..7636213a63 100644
--- a/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_managed_memory_resource.pyx
@@ -77,18 +77,21 @@ cdef class ManagedMemoryResource(_MemPool):
 
         opts_base._ipc_enabled = False  # IPC not supported for managed memory pools
 
-        # Set location based on preferred_location
-        if preferred_location is None:
-            # Let the driver decide
-            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE
-        elif device_id == -1:
-            # CPU/host preference
-            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
-        else:
-            # Device preference
-            opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
-
-        opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
+        IF CUDA_CORE_BUILD_MAJOR >= 13:
+            # Set location based on preferred_location
+            if preferred_location is None:
+                # Let the driver decide
+                opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_NONE
+            elif device_id == -1:
+                # CPU/host preference
+                opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_HOST
+            else:
+                # Device preference
+                opts_base._location = cydriver.CUmemLocationType.CU_MEM_LOCATION_TYPE_DEVICE
+
+            opts_base._type = cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED
+        ELSE:
+            raise RuntimeError("ManagedMemoryResource requires CUDA 13.0 or later")
 
         super().__init__(device_id, opts_base)
 
diff --git a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
index 5a6c240b09..dbbcc75715 100644
--- a/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
+++ b/cuda_core/cuda/core/experimental/_memory/_memory_pool.pyx
@@ -334,16 +334,17 @@ cdef int _MP_init_current(_MemPool self, int dev_id, _MemPoolOptions opts) excep
                 HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
             ELSE:
                 raise RuntimeError("not supported")
-        elif opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
-            # Managed memory pools
+        else:
             IF CUDA_CORE_BUILD_MAJOR >= 13:
-                loc.id = dev_id
-                loc.type = opts._location
-                HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
+                if opts._type == cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
+                    # Managed memory pools
+                    loc.id = dev_id
+                    loc.type = opts._location
+                    HANDLE_RETURN(cydriver.cuMemGetMemPool(&(self._handle), &loc, opts._type))
+                else:
+                    assert False
             ELSE:
-                raise RuntimeError("Managed memory pools not supported in CUDA < 13")
-        else:
-            assert False
+                assert False
 
     return 0
 
@@ -358,7 +359,10 @@ cdef int _MP_init_create(_MemPool self, int dev_id, _MemPoolOptions opts) except
     properties.location.id = dev_id
     properties.location.type = opts._location
     # managed memory does not support maxSize as of CUDA 13.0
-    if properties.allocType != cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
+    IF CUDA_CORE_BUILD_MAJOR >= 13:
+        if properties.allocType != cydriver.CUmemAllocationType.CU_MEM_ALLOCATION_TYPE_MANAGED:
+            properties.maxSize = opts._max_size
+    ELSE:
         properties.maxSize = opts._max_size
 
     self._dev_id = dev_id
diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index dca2d9c58a..7b70990f54 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -26,6 +26,63 @@
 from cuda.core.experimental._utils.cuda_utils import handle_return
 
 
+def _check_pinned_memory_available():
+    """Check if PinnedMemoryResource is available (CUDA 13.0+)."""
+    try:
+        device = Device()
+        return hasattr(device.properties, "host_memory_pools_supported")
+    except Exception:
+        return False
+
+
+def _check_managed_memory_available():
+    """Check if ManagedMemoryResource is available (CUDA 13.0+)."""
+    try:
+        device = Device()
+        return hasattr(device.properties, "memory_pools_supported") and hasattr(device.properties, "managed_memory")
+    except Exception:
+        return False
+
+
+# Skip marks for tests requiring CUDA 13.0+
+skipif_pinned_memory_unavailable = pytest.mark.skipif(
+    not _check_pinned_memory_available(), reason="PinnedMemoryResource requires CUDA 13.0 or later"
+)
+
+skipif_managed_memory_unavailable = pytest.mark.skipif(
+    not _check_managed_memory_available(), reason="ManagedMemoryResource requires CUDA 13.0 or later"
+)
+
+
+# Helper functions for runtime checks within tests
+def skip_if_pinned_memory_unsupported(device):
+    """Skip test if device doesn't support host memory pools or CUDA < 13."""
+    try:
+        if not device.properties.host_memory_pools_supported:
+            pytest.skip("Device does not support host mempool operations")
+    except AttributeError:
+        pytest.skip("PinnedMemoryResource requires CUDA 13.0 or later")
+
+
+def skip_if_managed_memory_unsupported(device):
+    """Skip test if device doesn't support managed memory pools or CUDA < 13."""
+    try:
+        if not device.properties.memory_pools_supported or not device.properties.managed_memory:
+            pytest.skip("Device does not support managed memory pool operations")
+    except AttributeError:
+        pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
+
+
+def create_managed_memory_resource_or_skip(*args, **kwargs):
+    """Create ManagedMemoryResource, skipping test if CUDA 13.0+ required."""
+    try:
+        return ManagedMemoryResource(*args, **kwargs)
+    except RuntimeError as e:
+        if "requires CUDA 13.0" in str(e):
+            pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
+        raise
+
+
 @pytest.fixture(scope="session", autouse=True)
 def session_setup():
     # Always init CUDA.
@@ -126,8 +183,7 @@ def ipc_memory_resource(request, ipc_device):
         options = DeviceMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
         mr = DeviceMemoryResource(ipc_device, options=options)
     else:  # pinned
-        if not ipc_device.properties.host_memory_pools_supported:
-            pytest.skip("Device does not support host mempool operations")
+        skip_if_pinned_memory_unsupported(ipc_device)
         options = PinnedMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=True)
         mr = PinnedMemoryResource(options=options)
 
diff --git a/cuda_core/tests/test_memory.py b/cuda_core/tests/test_memory.py
index 505161339f..fd2d1f1b08 100644
--- a/cuda_core/tests/test_memory.py
+++ b/cuda_core/tests/test_memory.py
@@ -41,6 +41,11 @@
 from helpers import IS_WINDOWS
 from helpers.buffers import DummyUnifiedMemoryResource
 
+from conftest import (
+    create_managed_memory_resource_or_skip,
+    skip_if_managed_memory_unsupported,
+    skip_if_pinned_memory_unsupported,
+)
 from cuda_python_test_helpers import supports_ipc_mempool
 
 POOL_SIZE = 2097152  # 2MB size
@@ -554,8 +559,7 @@ def test_device_memory_resource_initialization(use_device_object):
 
 def test_pinned_memory_resource_initialization(init_cuda):
     device = Device()
-    if not device.properties.host_memory_pools_supported:
-        pytest.skip("Device does not support host mempool operations")
+    skip_if_pinned_memory_unsupported(device)
 
     device.set_current()
 
@@ -575,12 +579,11 @@ def test_pinned_memory_resource_initialization(init_cuda):
 
 def test_managed_memory_resource_initialization(init_cuda):
     device = Device()
-    if not device.properties.memory_pools_supported or not device.properties.managed_memory:
-        pytest.skip("Device does not support managed memory pool operations")
+    skip_if_managed_memory_unsupported(device)
 
     device.set_current()
 
-    mr = ManagedMemoryResource()
+    mr = create_managed_memory_resource_or_skip()
     assert mr.is_device_accessible
     assert mr.is_host_accessible
 
@@ -843,8 +846,7 @@ def test_device_memory_resource_with_options(init_cuda):
 
 def test_pinned_memory_resource_with_options(init_cuda):
     device = Device()
-    if not device.properties.host_memory_pools_supported:
-        pytest.skip("Device does not support host mempool operations")
+    skip_if_pinned_memory_unsupported(device)
 
     device.set_current()
 
@@ -890,14 +892,13 @@ def test_pinned_memory_resource_with_options(init_cuda):
 
 def test_managed_memory_resource_with_options(init_cuda):
     device = Device()
-    if not device.properties.memory_pools_supported or not device.properties.managed_memory:
-        pytest.skip("Device does not support managed memory pool operations")
+    skip_if_managed_memory_unsupported(device)
 
     device.set_current()
 
     # Test basic pool creation
     options = ManagedMemoryResourceOptions()
-    mr = ManagedMemoryResource(options)
+    mr = create_managed_memory_resource_or_skip(options)
     assert mr.is_device_accessible
     assert mr.is_host_accessible
     assert not mr.is_ipc_enabled
@@ -960,8 +961,7 @@ def test_pinned_mempool_ipc_basic():
     device = Device()
     device.set_current()
 
-    if not device.properties.host_memory_pools_supported:
-        pytest.skip("Device does not support host mempool operations")
+    skip_if_pinned_memory_unsupported(device)
 
     if platform.system() == "Windows":
         pytest.skip("IPC not implemented for Windows")
@@ -1001,8 +1001,7 @@ def test_pinned_mempool_ipc_errors():
     device = Device()
     device.set_current()
 
-    if not device.properties.host_memory_pools_supported:
-        pytest.skip("Device does not support host mempool operations")
+    skip_if_pinned_memory_unsupported(device)
 
     # Test with IPC disabled (default)
     options = PinnedMemoryResourceOptions(max_size=POOL_SIZE, ipc_enabled=False)
@@ -1048,12 +1047,10 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory, property_name,
 
     if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
-    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
-        pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and (
-        not device.properties.memory_pools_supported or not device.properties.managed_memory
-    ):
-        pytest.skip("Device does not support managed memory pool operations")
+    elif MR is PinnedMemoryResource:
+        skip_if_pinned_memory_unsupported(device)
+    elif MR is ManagedMemoryResource:
+        skip_if_managed_memory_unsupported(device)
 
     # ManagedMemoryResource does not support IPC
     if MR is ManagedMemoryResource and ipc_enabled:
@@ -1077,7 +1074,7 @@ def test_mempool_attributes(ipc_enabled, memory_resource_factory, property_name,
         assert mr.is_ipc_enabled == ipc_enabled
     elif MR is ManagedMemoryResource:
         options = MRops()
-        mr = MR(options)
+        mr = create_managed_memory_resource_or_skip(options)
         assert not mr.is_ipc_enabled
 
     # Get the property value
@@ -1118,12 +1115,10 @@ def test_mempool_attributes_repr(memory_resource_factory):
 
     if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
-    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
-        pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and (
-        not device.properties.memory_pools_supported or not device.properties.managed_memory
-    ):
-        pytest.skip("Device does not support managed memory pool operations")
+    elif MR is PinnedMemoryResource:
+        skip_if_pinned_memory_unsupported(device)
+    elif MR is ManagedMemoryResource:
+        skip_if_managed_memory_unsupported(device)
 
     device.set_current()
 
@@ -1132,7 +1127,7 @@ def test_mempool_attributes_repr(memory_resource_factory):
     elif MR is PinnedMemoryResource:
         mr = MR(options={"max_size": 2048})
     elif MR is ManagedMemoryResource:
-        mr = MR(options={})
+        mr = create_managed_memory_resource_or_skip(options={})
 
     buffer1 = mr.allocate(64)
     buffer2 = mr.allocate(64)
@@ -1153,12 +1148,10 @@ def test_mempool_attributes_ownership(memory_resource_factory):
 
     if MR is DeviceMemoryResource and not device.properties.memory_pools_supported:
         pytest.skip("Device does not support mempool operations")
-    elif MR is PinnedMemoryResource and not device.properties.host_memory_pools_supported:
-        pytest.skip("Device does not support host mempool operations")
-    elif MR is ManagedMemoryResource and (
-        not device.properties.memory_pools_supported or not device.properties.managed_memory
-    ):
-        pytest.skip("Device does not support managed memory pool operations")
+    elif MR is PinnedMemoryResource:
+        skip_if_pinned_memory_unsupported(device)
+    elif MR is ManagedMemoryResource:
+        skip_if_managed_memory_unsupported(device)
 
     # Skip if IPC mempool is not supported on this platform/device (only relevant for DeviceMemoryResource)
     if MR is DeviceMemoryResource and not supports_ipc_mempool(device):
@@ -1171,7 +1164,7 @@ def test_mempool_attributes_ownership(memory_resource_factory):
     elif MR is PinnedMemoryResource:
         mr = MR(dict(max_size=POOL_SIZE))
     elif MR is ManagedMemoryResource:
-        mr = MR(dict())
+        mr = create_managed_memory_resource_or_skip(dict())
 
     attributes = mr.attributes
     mr.close()
@@ -1188,7 +1181,7 @@ def test_mempool_attributes_ownership(memory_resource_factory):
     elif MR is PinnedMemoryResource:
         mr = MR(dict(max_size=POOL_SIZE))  # noqa: F841
     elif MR is ManagedMemoryResource:
-        mr = MR(dict())  # noqa: F841
+        mr = create_managed_memory_resource_or_skip(dict())  # noqa: F841
 
     with pytest.raises(RuntimeError, match="is expired"):
         _ = attributes.used_mem_high

From f9e3c55f80e708f1838071886aea77a7b3ad807a Mon Sep 17 00:00:00 2001
From: Leo Fang <leof@nvidia.com>
Date: Sat, 13 Dec 2025 00:46:43 +0000
Subject: [PATCH 14/14] fix enum check based on driver team feedback + remove
 redudant code

---
 cuda_core/tests/conftest.py | 34 +---------------------------------
 1 file changed, 1 insertion(+), 33 deletions(-)

diff --git a/cuda_core/tests/conftest.py b/cuda_core/tests/conftest.py
index 7b70990f54..95539df16a 100644
--- a/cuda_core/tests/conftest.py
+++ b/cuda_core/tests/conftest.py
@@ -26,37 +26,7 @@
 from cuda.core.experimental._utils.cuda_utils import handle_return
 
 
-def _check_pinned_memory_available():
-    """Check if PinnedMemoryResource is available (CUDA 13.0+)."""
-    try:
-        device = Device()
-        return hasattr(device.properties, "host_memory_pools_supported")
-    except Exception:
-        return False
-
-
-def _check_managed_memory_available():
-    """Check if ManagedMemoryResource is available (CUDA 13.0+)."""
-    try:
-        device = Device()
-        return hasattr(device.properties, "memory_pools_supported") and hasattr(device.properties, "managed_memory")
-    except Exception:
-        return False
-
-
-# Skip marks for tests requiring CUDA 13.0+
-skipif_pinned_memory_unavailable = pytest.mark.skipif(
-    not _check_pinned_memory_available(), reason="PinnedMemoryResource requires CUDA 13.0 or later"
-)
-
-skipif_managed_memory_unavailable = pytest.mark.skipif(
-    not _check_managed_memory_available(), reason="ManagedMemoryResource requires CUDA 13.0 or later"
-)
-
-
-# Helper functions for runtime checks within tests
 def skip_if_pinned_memory_unsupported(device):
-    """Skip test if device doesn't support host memory pools or CUDA < 13."""
     try:
         if not device.properties.host_memory_pools_supported:
             pytest.skip("Device does not support host mempool operations")
@@ -65,16 +35,14 @@ def skip_if_pinned_memory_unsupported(device):
 
 
 def skip_if_managed_memory_unsupported(device):
-    """Skip test if device doesn't support managed memory pools or CUDA < 13."""
     try:
-        if not device.properties.memory_pools_supported or not device.properties.managed_memory:
+        if not device.properties.memory_pools_supported or not device.properties.concurrent_managed_access:
             pytest.skip("Device does not support managed memory pool operations")
     except AttributeError:
         pytest.skip("ManagedMemoryResource requires CUDA 13.0 or later")
 
 
 def create_managed_memory_resource_or_skip(*args, **kwargs):
-    """Create ManagedMemoryResource, skipping test if CUDA 13.0+ required."""
     try:
         return ManagedMemoryResource(*args, **kwargs)
     except RuntimeError as e: