From 9000cfad0c7ee4932c2def1bc35f38e563c3a310 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E5=8E=9A=E5=AE=8F?= <2695316095@qq.com>
Date: Tue, 21 Apr 2026 11:31:38 +0800
Subject: [PATCH 1/3] cuda: add partial eviction on pool OOM

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)
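
Note: on an allocation failure the pool no longer flushes everything up front; it frees the largest cached buffers first and stops as soon as enough bytes are released, falling back to a full clear_pool() only if that is not enough. A minimal host-side sketch of that selection loop, for illustration only; `Buffer`, `evict_largest_first`, and `do_free` are stand-ins for the real ggml_cuda_buffer pool and cudaFree(), not code from this patch:

    #include <cassert>
    #include <cstddef>

    struct Buffer { void * ptr; size_t size; };   // stand-in for ggml_cuda_buffer

    // Free the largest cached buffers until at least `need` bytes are released;
    // `do_free` stands in for cudaFree(). Returns the number of bytes freed.
    template <typename FreeFn, int N>
    size_t evict_largest_first(Buffer (&pool)[N], size_t need, FreeFn do_free) {
        size_t freed = 0;
        while (freed < need) {
            int victim = -1;
            size_t victim_size = 0;
            for (int i = 0; i < N; ++i) {
                if (pool[i].ptr != nullptr && pool[i].size > victim_size) {
                    victim = i;
                    victim_size = pool[i].size;
                }
            }
            if (victim < 0) {
                break;   // cache exhausted before reaching the target
            }
            do_free(pool[victim].ptr);
            freed += pool[victim].size;
            pool[victim] = {nullptr, 0};
        }
        return freed;
    }

    int main() {
        int a, b, c;
        Buffer pool[3] = {{&a, 100}, {&b, 400}, {&c, 200}};
        // need 500 bytes: frees the 400- and 200-byte buffers, keeps the 100-byte one
        size_t freed = evict_largest_first(pool, 500, [](void *){});
        assert(freed == 600 && pool[0].ptr != nullptr);
        return 0;
    }

Freeing largest-first reaches the byte target in the fewest free calls, at the cost of sometimes releasing more than strictly necessary.
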
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 185956317e0..b4504e34773 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -428,16 +428,47 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
         cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
-        if (err == cudaErrorMemoryAllocation) {
+        if (err == cudaErrorMemoryAllocation && pool_size > 0) {
             (void)cudaGetLastError();
             const size_t cached_bytes = pool_size;
-            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying partial eviction from %.2f MiB cached\n",
                 device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
             CUDA_CHECK(cudaDeviceSynchronize());
-            clear_pool();
+
+            size_t freed = 0;
+            while (freed < look_ahead_size) {
+                int victim = -1;
+                size_t victim_size = 0;
+                for (int i = 0; i < MAX_BUFFERS; ++i) {
+                    ggml_cuda_buffer & b = buffer_pool[i];
+                    if (b.ptr != nullptr && b.size > victim_size) {
+                        victim = i;
+                        victim_size = b.size;
+                    }
+                }
+                if (victim < 0) {
+                    break;
+                }
+                ggml_cuda_buffer & b = buffer_pool[victim];
+                CUDA_CHECK(cudaFree(b.ptr));
+                freed += b.size;
+                pool_size -= b.size;
+                b.ptr = nullptr;
+                b.size = 0;
+            }
+            err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+
+            if (err == cudaErrorMemoryAllocation && pool_size > 0) {
+                (void)cudaGetLastError();
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: partial eviction freed %.2f MiB, flushing remaining %.2f MiB cached\n",
+                    device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
+                clear_pool();
+                err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+            }
+
             if (err == cudaSuccess) {
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after eviction\n", device);
             }
         }
         CUDA_CHECK(err);

From d3e3a69721cef45e4c235cc8a9f018733a0203a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E5=8E=9A=E5=AE=8F?= <2695316095@qq.com>
Date: Tue, 21 Apr 2026 20:35:30 +0800
Subject: [PATCH 2/3] cuda: LRU reclaim + overallocate for legacy pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 梁厚宏 <2695316095@qq.com>
---
 ggml/src/ggml-cuda/common.cuh       | 10 +++++-----
 ggml/src/ggml-cuda/fattn-common.cuh |  4 ++--
 ggml/src/ggml-cuda/ggml-cuda.cu     | 53 +++++++++++++++++++++--------
 3 files changed, 45 insertions(+), 22 deletions(-)
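
Note: `overallocate = false` is a default argument on a virtual function; C++ binds default arguments statically, but since both the declaration and every call site here go through the ggml_cuda_pool base class, the behavior stays consistent. The sizing math also saturates instead of wrapping: growth is clamped at SIZE_MAX, and the 256-byte round-up rounds down only in the saturated case. A standalone sketch of the same arithmetic; the `look_ahead` helper is an illustrative name, not part of the patch:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Look-ahead sizing as in the patch: 2x for overallocating callers, 1.05x
    // otherwise, saturating at SIZE_MAX instead of wrapping, then rounded up
    // to a 256-byte multiple (rounded down only in the saturated case).
    static size_t look_ahead(size_t size, bool overallocate) {
        size_t n = overallocate ? ((size > SIZE_MAX / 2) ? SIZE_MAX : 2 * size)
                                : (size_t)(1.05 * size);
        return (n > SIZE_MAX - 255) ? (SIZE_MAX & ~size_t(255))
                                    : 256 * ((n + 255) / 256);
    }

    int main() {
        assert(look_ahead(1000, true)  == 2048);                         // 2000 -> 2048
        assert(look_ahead(1000, false) == 1280);                         // 1050 -> 1280
        assert(look_ahead(SIZE_MAX, true) == (SIZE_MAX & ~size_t(255))); // saturated, no wrap
        return 0;
    }
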
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 3aec1742ee1..3a41c8bb477 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1105,7 +1105,7 @@ int ggml_cuda_get_device();
 struct ggml_cuda_pool {
     virtual ~ggml_cuda_pool() = default;
 
-    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void * alloc(size_t size, size_t * actual_size, bool overallocate = false) = 0;
     virtual void free(void * ptr, size_t size) = 0;
 };
 
@@ -1131,16 +1131,16 @@ struct ggml_cuda_pool_alloc {
     }
 
     // size is in number of elements
-    T * alloc(size_t size) {
+    T * alloc(size_t size, bool overallocate = false) {
         GGML_ASSERT(pool != nullptr);
         GGML_ASSERT(ptr == nullptr);
-        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size, overallocate);
         return ptr;
     }
 
-    T * alloc(ggml_cuda_pool & pool, size_t size) {
+    T * alloc(ggml_cuda_pool & pool, size_t size, bool overallocate = false) {
         this->pool = &pool;
-        return alloc(size);
+        return alloc(size, overallocate);
     }
 
     T * get() {
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index beeb5238946..c17440f9b89 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -966,7 +966,7 @@ void launch_fattn(
         const size_t bs = ggml_blck_size(K->type);
         const size_t ts = ggml_type_size(K->type);
 
-        K_f16.alloc(ggml_nelements(K));
+        K_f16.alloc(ggml_nelements(K), /*overallocate=*/ true);
         if (ggml_is_contiguously_allocated(K)) {
             to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
             to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
@@ -999,7 +999,7 @@ void launch_fattn(
         const size_t bs = ggml_blck_size(V->type);
         const size_t ts = ggml_type_size(V->type);
 
-        V_f16.alloc(ggml_nelements(V));
+        V_f16.alloc(ggml_nelements(V), /*overallocate=*/ true);
         if (ggml_is_contiguously_allocated(V)) {
             to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
             to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b4504e34773..d639871212b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -363,6 +363,10 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
     ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
     size_t pool_size = 0;
 
+    // per-slot timestamp stamped on free(); the smallest value marks the least recently freed slot, used for LRU-style reclaim on OOM
+    uint64_t clock = 0;
+    uint64_t buffer_pool_ts[MAX_BUFFERS] = {};
+
     explicit ggml_cuda_pool_leg(int device) :
         device(device) {
     }
@@ -381,11 +385,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 pool_size -= b.size;
                 b.ptr = nullptr;
                 b.size = 0;
+                buffer_pool_ts[i] = 0;
             }
         }
     }
 
-    void * alloc(size_t size, size_t * actual_size) override {
+    void * alloc(size_t size, size_t * actual_size, bool overallocate) override {
 #ifdef DEBUG_CUDA_MALLOC
         int nnz = 0;
         size_t max_size = 0;
@@ -409,6 +414,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 *actual_size = b.size;
                 b.ptr = nullptr;
                 b.size = 0;
+                buffer_pool_ts[i] = 0;
                 return ptr;
             }
         }
@@ -421,29 +427,41 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
             *actual_size = b.size;
             b.ptr = nullptr;
             b.size = 0;
+            buffer_pool_ts[ibest] = 0;
             return ptr;
         }
 
         void * ptr;
-        size_t look_ahead_size = (size_t) (1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+        size_t look_ahead_size;
+        if (overallocate) {
+            look_ahead_size = (size > SIZE_MAX / 2) ? SIZE_MAX : 2 * size;
+        } else {
+            look_ahead_size = (size_t) (1.05 * size);
+        }
+        // 256-byte align, overflow-safe
+        look_ahead_size = (look_ahead_size > SIZE_MAX - 255)
+            ? (SIZE_MAX & ~size_t(255))
+            : 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
         cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
         if (err == cudaErrorMemoryAllocation && pool_size > 0) {
             (void)cudaGetLastError();
-            const size_t cached_bytes = pool_size;
-            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying partial eviction from %.2f MiB cached\n",
-                device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
+            const size_t cached_bytes = pool_size;
+            const size_t reclaim_target = (look_ahead_size > SIZE_MAX / 3)
+                ? SIZE_MAX
+                : 3 * look_ahead_size;
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying LRU-style reclaim (target %.2f MiB) from %.2f MiB cached\n",
+                device, look_ahead_size/1024.0/1024.0, reclaim_target/1024.0/1024.0, cached_bytes/1024.0/1024.0);
             CUDA_CHECK(cudaDeviceSynchronize());
 
             size_t freed = 0;
-            while (freed < look_ahead_size) {
-                int victim = -1;
-                size_t victim_size = 0;
+            while (freed < reclaim_target) {
+                int victim = -1;
+                uint64_t oldest = UINT64_MAX;
                 for (int i = 0; i < MAX_BUFFERS; ++i) {
                     ggml_cuda_buffer & b = buffer_pool[i];
-                    if (b.ptr != nullptr && b.size > victim_size) {
-                        victim = i;
-                        victim_size = b.size;
+                    if (b.ptr != nullptr && buffer_pool_ts[i] < oldest) {
+                        victim = i;
+                        oldest = buffer_pool_ts[i];
                     }
                 }
                 if (victim < 0) {
@@ -455,20 +473,23 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 pool_size -= b.size;
                 b.ptr = nullptr;
                 b.size = 0;
+                buffer_pool_ts[victim] = 0;
             }
             err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
 
+            // terminal fallback if the 3x-bounded reclaim left cached buffers behind
             if (err == cudaErrorMemoryAllocation && pool_size > 0) {
                 (void)cudaGetLastError();
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: partial eviction freed %.2f MiB, flushing remaining %.2f MiB cached\n",
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: reclaim bounded at %.2f MiB, flushing remaining %.2f MiB cached\n",
                     device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
                 clear_pool();
                 err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
             }
 
             if (err == cudaSuccess) {
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after eviction\n", device);
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after reclaiming %.2f MiB\n",
+                    device, freed/1024.0/1024.0);
             }
         }
         CUDA_CHECK(err);
@@ -487,6 +508,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
             if (b.ptr == nullptr) {
                 b.ptr = ptr;
                 b.size = size;
+                buffer_pool_ts[i] = ++clock;
                 return;
             }
         }
@@ -530,7 +552,8 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         }
     }
 
-    void * alloc(size_t size, size_t * actual_size) override {
+    void * alloc(size_t size, size_t * actual_size, [[maybe_unused]] bool overallocate) override {
+        // overallocate hint is ignored: VMM growth is already granularity-based
         // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
         const size_t alignment = 128;
         size = alignment * ((size + alignment - 1) / alignment);
From 00afd1b04e5e168b002ef72a67aab965cb797204 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E5=8E=9A=E5=AE=8F?= <2695316095@qq.com>
Date: Thu, 23 Apr 2026 23:13:10 +0800
Subject: [PATCH 3/3] cuda: add HIP multi-GPU guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 梁厚宏 <2695316095@qq.com>
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 91 +++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 38 deletions(-)
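
Note: the guard keeps a single code path compiling under both builds by pairing `} else` with `#endif`: with GGML_USE_HIP defined, the brace block becomes the else-branch of the device-count check; without it, the same block is entered unconditionally. A reduced sketch of that control-flow shape; USE_HIP, device_count, full_flush, and lru_reclaim are placeholders, not the real ggml symbols:

    #include <cstdio>

    // Placeholders standing in for the real build flag and pool operations.
    // #define USE_HIP
    static int  device_count() { return 2; }
    static void full_flush()   { std::puts("full flush"); }
    static void lru_reclaim()  { std::puts("LRU reclaim"); }

    static void on_oom() {
    #if defined(USE_HIP)
        // with USE_HIP: multi-GPU flushes everything, single GPU falls through
        if (device_count() > 1) {
            full_flush();
        } else
    #endif
        {
            // without USE_HIP this brace block is entered unconditionally
            lru_reclaim();
        }
    }

    int main() {
        on_oom();
        return 0;
    }
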
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d639871212b..5e49449c84d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -445,51 +445,66 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
         if (err == cudaErrorMemoryAllocation && pool_size > 0) {
             (void)cudaGetLastError();
-            const size_t cached_bytes = pool_size;
-            const size_t reclaim_target = (look_ahead_size > SIZE_MAX / 3)
-                ? SIZE_MAX
-                : 3 * look_ahead_size;
-            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying LRU-style reclaim (target %.2f MiB) from %.2f MiB cached\n",
-                device, look_ahead_size/1024.0/1024.0, reclaim_target/1024.0/1024.0, cached_bytes/1024.0/1024.0);
             CUDA_CHECK(cudaDeviceSynchronize());
 
-            size_t freed = 0;
-            while (freed < reclaim_target) {
-                int victim = -1;
-                uint64_t oldest = UINT64_MAX;
-                for (int i = 0; i < MAX_BUFFERS; ++i) {
-                    ggml_cuda_buffer & b = buffer_pool[i];
-                    if (b.ptr != nullptr && buffer_pool_ts[i] < oldest) {
-                        victim = i;
-                        oldest = buffer_pool_ts[i];
-                    }
+#if defined(GGML_USE_HIP)
+            // HIP multi-GPU: the timing of incremental LRU frees amplifies ROCm/rocm-systems#4817; fall back to clear_pool().
+            if (ggml_backend_cuda_get_device_count() > 1) {
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: HIP multi-GPU, flushing %.2f MiB of cached buffers and retrying\n",
+                    device, pool_size/1024.0/1024.0);
+                clear_pool();
+                err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+                if (err == cudaSuccess) {
+                    GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
                 }
-                if (victim < 0) {
-                    break;
+            } else
+#endif
+            {
+                const size_t cached_bytes = pool_size;
+                const size_t reclaim_target = (look_ahead_size > SIZE_MAX / 3)
+                    ? SIZE_MAX
+                    : 3 * look_ahead_size;
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying LRU-style reclaim (target %.2f MiB) from %.2f MiB cached\n",
+                    device, look_ahead_size/1024.0/1024.0, reclaim_target/1024.0/1024.0, cached_bytes/1024.0/1024.0);
+
+                size_t freed = 0;
+                while (freed < reclaim_target) {
+                    int victim = -1;
+                    uint64_t oldest = UINT64_MAX;
+                    for (int i = 0; i < MAX_BUFFERS; ++i) {
+                        ggml_cuda_buffer & b = buffer_pool[i];
+                        if (b.ptr != nullptr && buffer_pool_ts[i] < oldest) {
+                            victim = i;
+                            oldest = buffer_pool_ts[i];
+                        }
+                    }
+                    if (victim < 0) {
+                        break;
+                    }
+                    ggml_cuda_buffer & b = buffer_pool[victim];
+                    CUDA_CHECK(cudaFree(b.ptr));
+                    freed += b.size;
+                    pool_size -= b.size;
+                    b.ptr = nullptr;
+                    b.size = 0;
+                    buffer_pool_ts[victim] = 0;
                 }
-                ggml_cuda_buffer & b = buffer_pool[victim];
-                CUDA_CHECK(cudaFree(b.ptr));
-                freed += b.size;
-                pool_size -= b.size;
-                b.ptr = nullptr;
-                b.size = 0;
-                buffer_pool_ts[victim] = 0;
-            }
-
-            err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
 
-            // terminal fallback if the 3x-bounded reclaim left cached buffers behind
-            if (err == cudaErrorMemoryAllocation && pool_size > 0) {
-                (void)cudaGetLastError();
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: reclaim bounded at %.2f MiB, flushing remaining %.2f MiB cached\n",
-                    device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
-                clear_pool();
                 err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
-            }
 
-            if (err == cudaSuccess) {
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after reclaiming %.2f MiB\n",
-                    device, freed/1024.0/1024.0);
+                // terminal fallback if the 3x-bounded reclaim left cached buffers behind
+                if (err == cudaErrorMemoryAllocation && pool_size > 0) {
+                    (void)cudaGetLastError();
+                    GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: reclaim bounded at %.2f MiB, flushing remaining %.2f MiB cached\n",
+                        device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
+                    clear_pool();
+                    err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+                }
+
+                if (err == cudaSuccess) {
+                    GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after reclaiming %.2f MiB\n",
+                        device, freed/1024.0/1024.0);
+                }
             }
         }
         CUDA_CHECK(err);
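
For reference, the LRU victim scan that the non-HIP path retains from patch 2 reduces to picking the live slot with the smallest free-timestamp. A self-contained sketch with a mock pool; `Slot` and `lru_victim` are illustrative names, not the real ggml types:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    struct Slot { void * ptr; size_t size; uint64_t ts; };  // ts stamped when freed into the pool

    // Return the index of the least recently freed live slot, or -1 if none.
    template <int N>
    int lru_victim(const Slot (&pool)[N]) {
        int victim = -1;
        uint64_t oldest = UINT64_MAX;
        for (int i = 0; i < N; ++i) {
            if (pool[i].ptr != nullptr && pool[i].ts < oldest) {
                victim = i;
                oldest = pool[i].ts;
            }
        }
        return victim;
    }

    int main() {
        int a, b;
        Slot pool[3] = {{&a, 64, 7}, {nullptr, 0, 0}, {&b, 32, 3}};
        assert(lru_victim(pool) == 2);   // ts 3 beats ts 7; the empty slot is skipped
        return 0;
    }
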