From 9000cfad0c7ee4932c2def1bc35f38e563c3a310 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E5=8E=9A=E5=AE=8F?= <2695316095@qq.com>
Date: Tue, 21 Apr 2026 11:31:38 +0800
Subject: [PATCH 1/3] cuda: add partial eviction on pool OOM

---
 ggml/src/ggml-cuda/ggml-cuda.cu | 39 +++++++++++++++++++++++++++++++++++----
 1 file changed, 35 insertions(+), 4 deletions(-)
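
Note: on an allocation failure the pool no longer flushes everything up front; it frees the largest cached buffers first and stops as soon as enough bytes are released, falling back to a full clear_pool() only if that is not enough. A minimal host-side sketch of that selection loop, for illustration only; `Buffer`, `evict_largest_first`, and `do_free` are stand-ins for the real ggml_cuda_buffer pool and cudaFree(), not code from this patch:

    #include <cassert>
    #include <cstddef>

    struct Buffer { void * ptr; size_t size; };   // stand-in for ggml_cuda_buffer

    // Free the largest cached buffers until at least `need` bytes are released;
    // `do_free` stands in for cudaFree(). Returns the number of bytes freed.
    template <typename FreeFn, int N>
    size_t evict_largest_first(Buffer (&pool)[N], size_t need, FreeFn do_free) {
        size_t freed = 0;
        while (freed < need) {
            int victim = -1;
            size_t victim_size = 0;
            for (int i = 0; i < N; ++i) {
                if (pool[i].ptr != nullptr && pool[i].size > victim_size) {
                    victim = i;
                    victim_size = pool[i].size;
                }
            }
            if (victim < 0) {
                break;   // cache exhausted before reaching the target
            }
            do_free(pool[victim].ptr);
            freed += pool[victim].size;
            pool[victim] = {nullptr, 0};
        }
        return freed;
    }

    int main() {
        int a, b, c;
        Buffer pool[3] = {{&a, 100}, {&b, 400}, {&c, 200}};
        // need 500 bytes: frees the 400- and 200-byte buffers, keeps the 100-byte one
        size_t freed = evict_largest_first(pool, 500, [](void *){});
        assert(freed == 600 && pool[0].ptr != nullptr);
        return 0;
    }

Freeing largest-first reaches the byte target in the fewest free calls, at the cost of sometimes releasing more than strictly necessary.
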
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index 185956317e0..b4504e34773 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -428,16 +428,47 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         look_ahead_size = 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
         cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
-        if (err == cudaErrorMemoryAllocation) {
+        if (err == cudaErrorMemoryAllocation && pool_size > 0) {
             (void)cudaGetLastError();
             const size_t cached_bytes = pool_size;
-            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, flushing %.2f MiB of cached buffers and retrying\n",
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying partial eviction from %.2f MiB cached\n",
                 device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
             CUDA_CHECK(cudaDeviceSynchronize());
-            clear_pool();
+
+            size_t freed = 0;
+            while (freed < look_ahead_size) {
+                int victim = -1;
+                size_t victim_size = 0;
+                for (int i = 0; i < MAX_BUFFERS; ++i) {
+                    ggml_cuda_buffer & b = buffer_pool[i];
+                    if (b.ptr != nullptr && b.size > victim_size) {
+                        victim = i;
+                        victim_size = b.size;
+                    }
+                }
+                if (victim < 0) {
+                    break;
+                }
+                ggml_cuda_buffer & b = buffer_pool[victim];
+                CUDA_CHECK(cudaFree(b.ptr));
+                freed += b.size;
+                pool_size -= b.size;
+                b.ptr = nullptr;
+                b.size = 0;
+            }
+            err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+
+            if (err == cudaErrorMemoryAllocation && pool_size > 0) {
+                (void)cudaGetLastError();
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: partial eviction freed %.2f MiB, flushing remaining %.2f MiB cached\n",
+                    device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
+                clear_pool();
+                err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+            }
+
             if (err == cudaSuccess) {
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after eviction\n", device);
             }
         }
         CUDA_CHECK(err);

From d3e3a69721cef45e4c235cc8a9f018733a0203a8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E5=8E=9A=E5=AE=8F?= <2695316095@qq.com>
Date: Tue, 21 Apr 2026 20:35:30 +0800
Subject: [PATCH 2/3] cuda: LRU reclaim + overallocate for legacy pool
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 梁厚宏 <2695316095@qq.com>
---
 ggml/src/ggml-cuda/common.cuh       | 10 +++++-----
 ggml/src/ggml-cuda/fattn-common.cuh |  4 ++--
 ggml/src/ggml-cuda/ggml-cuda.cu     | 53 +++++++++++++++++++++--------
 3 files changed, 45 insertions(+), 22 deletions(-)
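
Note: `overallocate = false` is a default argument on a virtual function; C++ binds default arguments statically, but since both the declaration and every call site here go through the ggml_cuda_pool base class, the behavior stays consistent. The sizing math also saturates instead of wrapping: growth is clamped at SIZE_MAX, and the 256-byte round-up rounds down only in the saturated case. A standalone sketch of the same arithmetic; the `look_ahead` helper is an illustrative name, not part of the patch:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    // Look-ahead sizing as in the patch: 2x for overallocating callers, 1.05x
    // otherwise, saturating at SIZE_MAX instead of wrapping, then rounded up
    // to a 256-byte multiple (rounded down only in the saturated case).
    static size_t look_ahead(size_t size, bool overallocate) {
        size_t n = overallocate ? ((size > SIZE_MAX / 2) ? SIZE_MAX : 2 * size)
                                : (size_t)(1.05 * size);
        return (n > SIZE_MAX - 255) ? (SIZE_MAX & ~size_t(255))
                                    : 256 * ((n + 255) / 256);
    }

    int main() {
        assert(look_ahead(1000, true)  == 2048);                         // 2000 -> 2048
        assert(look_ahead(1000, false) == 1280);                         // 1050 -> 1280
        assert(look_ahead(SIZE_MAX, true) == (SIZE_MAX & ~size_t(255))); // saturated, no wrap
        return 0;
    }
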
diff --git a/ggml/src/ggml-cuda/common.cuh b/ggml/src/ggml-cuda/common.cuh
index 3aec1742ee1..3a41c8bb477 100644
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -1105,7 +1105,7 @@ int ggml_cuda_get_device();
 struct ggml_cuda_pool {
     virtual ~ggml_cuda_pool() = default;
 
-    virtual void * alloc(size_t size, size_t * actual_size) = 0;
+    virtual void * alloc(size_t size, size_t * actual_size, bool overallocate = false) = 0;
     virtual void free(void * ptr, size_t size) = 0;
 };
 
@@ -1131,16 +1131,16 @@ struct ggml_cuda_pool_alloc {
     }
 
     // size is in number of elements
-    T * alloc(size_t size) {
+    T * alloc(size_t size, bool overallocate = false) {
         GGML_ASSERT(pool != nullptr);
         GGML_ASSERT(ptr == nullptr);
-        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size);
+        ptr = (T *) pool->alloc(size * sizeof(T), &this->actual_size, overallocate);
         return ptr;
     }
 
-    T * alloc(ggml_cuda_pool & pool, size_t size) {
+    T * alloc(ggml_cuda_pool & pool, size_t size, bool overallocate = false) {
         this->pool = &pool;
-        return alloc(size);
+        return alloc(size, overallocate);
     }
 
     T * get() {
diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index beeb5238946..c17440f9b89 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -966,7 +966,7 @@ void launch_fattn(
         const size_t bs = ggml_blck_size(K->type);
         const size_t ts = ggml_type_size(K->type);
 
-        K_f16.alloc(ggml_nelements(K));
+        K_f16.alloc(ggml_nelements(K), /*overallocate=*/ true);
         if (ggml_is_contiguously_allocated(K)) {
             to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(K->type);
             to_fp16(K_data, K_f16.ptr, ggml_nelements(K), main_stream);
@@ -999,7 +999,7 @@ void launch_fattn(
         const size_t bs = ggml_blck_size(V->type);
         const size_t ts = ggml_type_size(V->type);
 
-        V_f16.alloc(ggml_nelements(V));
+        V_f16.alloc(ggml_nelements(V), /*overallocate=*/ true);
         if (ggml_is_contiguously_allocated(V)) {
             to_fp16_cuda_t to_fp16 = ggml_get_to_fp16_cuda(V->type);
             to_fp16(V_data, V_f16.ptr, ggml_nelements(V), main_stream);
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index b4504e34773..d639871212b 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -363,6 +363,10 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
     ggml_cuda_buffer buffer_pool[MAX_BUFFERS] = {};
     size_t pool_size = 0;
 
+    // per-slot timestamp stamped on free(); the smallest value marks the least recently freed slot, used for LRU-style reclaim on OOM
+    uint64_t clock = 0;
+    uint64_t buffer_pool_ts[MAX_BUFFERS] = {};
+
     explicit ggml_cuda_pool_leg(int device) :
         device(device) {
     }
@@ -381,11 +385,12 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 pool_size -= b.size;
                 b.ptr = nullptr;
                 b.size = 0;
+                buffer_pool_ts[i] = 0;
             }
         }
     }
 
-    void * alloc(size_t size, size_t * actual_size) override {
+    void * alloc(size_t size, size_t * actual_size, bool overallocate) override {
 #ifdef DEBUG_CUDA_MALLOC
         int nnz = 0;
         size_t max_size = 0;
@@ -409,6 +414,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 *actual_size = b.size;
                 b.ptr = nullptr;
                 b.size = 0;
+                buffer_pool_ts[i] = 0;
                 return ptr;
             }
         }
@@ -421,29 +427,41 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
             *actual_size = b.size;
             b.ptr = nullptr;
             b.size = 0;
+            buffer_pool_ts[ibest] = 0;
             return ptr;
         }
 
         void * ptr;
-        size_t look_ahead_size = (size_t) (1.05 * size);
-        look_ahead_size = 256 * ((look_ahead_size + 255)/256);
+        size_t look_ahead_size;
+        if (overallocate) {
+            look_ahead_size = (size > SIZE_MAX / 2) ? SIZE_MAX : 2 * size;
+        } else {
+            look_ahead_size = (size_t) (1.05 * size);
+        }
+        // 256-byte align, overflow-safe
+        look_ahead_size = (look_ahead_size > SIZE_MAX - 255)
+            ? (SIZE_MAX & ~size_t(255))
+            : 256 * ((look_ahead_size + 255)/256);
         ggml_cuda_set_device(device);
         cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
         if (err == cudaErrorMemoryAllocation && pool_size > 0) {
             (void)cudaGetLastError();
-            const size_t cached_bytes = pool_size;
-            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying partial eviction from %.2f MiB cached\n",
-                device, look_ahead_size/1024.0/1024.0, cached_bytes/1024.0/1024.0);
+            const size_t cached_bytes = pool_size;
+            const size_t reclaim_target = (look_ahead_size > SIZE_MAX / 3)
+                ? SIZE_MAX
+                : 3 * look_ahead_size;
+            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying LRU-style reclaim (target %.2f MiB) from %.2f MiB cached\n",
+                device, look_ahead_size/1024.0/1024.0, reclaim_target/1024.0/1024.0, cached_bytes/1024.0/1024.0);
             CUDA_CHECK(cudaDeviceSynchronize());
 
             size_t freed = 0;
-            while (freed < look_ahead_size) {
-                int victim = -1;
-                size_t victim_size = 0;
+            while (freed < reclaim_target) {
+                int victim = -1;
+                uint64_t oldest = UINT64_MAX;
                 for (int i = 0; i < MAX_BUFFERS; ++i) {
                     ggml_cuda_buffer & b = buffer_pool[i];
-                    if (b.ptr != nullptr && b.size > victim_size) {
-                        victim = i;
-                        victim_size = b.size;
+                    if (b.ptr != nullptr && buffer_pool_ts[i] < oldest) {
+                        victim = i;
+                        oldest = buffer_pool_ts[i];
                     }
                 }
                 if (victim < 0) {
@@ -455,20 +473,23 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
                 pool_size -= b.size;
                 b.ptr = nullptr;
                 b.size = 0;
+                buffer_pool_ts[victim] = 0;
             }
             err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
 
+            // terminal fallback if the 3x-bounded reclaim left cached buffers behind
             if (err == cudaErrorMemoryAllocation && pool_size > 0) {
                 (void)cudaGetLastError();
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: partial eviction freed %.2f MiB, flushing remaining %.2f MiB cached\n",
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: reclaim bounded at %.2f MiB, flushing remaining %.2f MiB cached\n",
                     device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
                 clear_pool();
                 err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
             }
 
             if (err == cudaSuccess) {
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after eviction\n", device);
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after reclaiming %.2f MiB\n",
+                    device, freed/1024.0/1024.0);
             }
         }
         CUDA_CHECK(err);
@@ -487,6 +508,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
             if (b.ptr == nullptr) {
                 b.ptr = ptr;
                 b.size = size;
+                buffer_pool_ts[i] = ++clock;
                 return;
             }
         }
@@ -530,7 +552,8 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
         }
     }
 
-    void * alloc(size_t size, size_t * actual_size) override {
+    void * alloc(size_t size, size_t * actual_size, [[maybe_unused]] bool overallocate) override {
+        // overallocate hint is ignored: VMM growth is already granularity-based
         // round up the allocation size to the alignment to ensure that all allocations are aligned for all data types
         const size_t alignment = 128;
         size = alignment * ((size + alignment - 1) / alignment);
From 00afd1b04e5e168b002ef72a67aab965cb797204 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E6=A2=81=E5=8E=9A=E5=AE=8F?= <2695316095@qq.com>
Date: Thu, 23 Apr 2026 23:13:10 +0800
Subject: [PATCH 3/3] cuda: add HIP multi-GPU guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Signed-off-by: 梁厚宏 <2695316095@qq.com>
---
 ggml/src/ggml-cuda/ggml-cuda.cu | 91 +++++++++++++++++++--------------
 1 file changed, 53 insertions(+), 38 deletions(-)
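
Note: the guard keeps a single code path compiling under both builds by pairing `} else` with `#endif`: with GGML_USE_HIP defined, the brace block becomes the else-branch of the device-count check; without it, the same block is entered unconditionally. A reduced sketch of that control-flow shape; USE_HIP, device_count, full_flush, and lru_reclaim are placeholders, not the real ggml symbols:

    #include <cstdio>

    // Placeholders standing in for the real build flag and pool operations.
    // #define USE_HIP
    static int  device_count() { return 2; }
    static void full_flush()   { std::puts("full flush"); }
    static void lru_reclaim()  { std::puts("LRU reclaim"); }

    static void on_oom() {
    #if defined(USE_HIP)
        // with USE_HIP: multi-GPU flushes everything, single GPU falls through
        if (device_count() > 1) {
            full_flush();
        } else
    #endif
        {
            // without USE_HIP this brace block is entered unconditionally
            lru_reclaim();
        }
    }

    int main() {
        on_oom();
        return 0;
    }
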
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
index d639871212b..5e49449c84d 100644
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -445,51 +445,66 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
         cudaError_t err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
         if (err == cudaErrorMemoryAllocation && pool_size > 0) {
             (void)cudaGetLastError();
-            const size_t cached_bytes = pool_size;
-            const size_t reclaim_target = (look_ahead_size > SIZE_MAX / 3)
-                ? SIZE_MAX
-                : 3 * look_ahead_size;
-            GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying LRU-style reclaim (target %.2f MiB) from %.2f MiB cached\n",
-                device, look_ahead_size/1024.0/1024.0, reclaim_target/1024.0/1024.0, cached_bytes/1024.0/1024.0);
             CUDA_CHECK(cudaDeviceSynchronize());
 
-            size_t freed = 0;
-            while (freed < reclaim_target) {
-                int victim = -1;
-                uint64_t oldest = UINT64_MAX;
-                for (int i = 0; i < MAX_BUFFERS; ++i) {
-                    ggml_cuda_buffer & b = buffer_pool[i];
-                    if (b.ptr != nullptr && buffer_pool_ts[i] < oldest) {
-                        victim = i;
-                        oldest = buffer_pool_ts[i];
-                    }
+#if defined(GGML_USE_HIP)
+            // HIP multi-GPU: the timing of incremental LRU frees amplifies ROCm/rocm-systems#4817; fall back to clear_pool().
+            if (ggml_backend_cuda_get_device_count() > 1) {
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: HIP multi-GPU, flushing %.2f MiB of cached buffers and retrying\n",
+                    device, pool_size/1024.0/1024.0);
+                clear_pool();
+                err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+                if (err == cudaSuccess) {
+                    GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded\n", device);
                 }
-                if (victim < 0) {
-                    break;
+            } else
+#endif
+            {
+                const size_t cached_bytes = pool_size;
+                const size_t reclaim_target = (look_ahead_size > SIZE_MAX / 3)
+                    ? SIZE_MAX
+                    : 3 * look_ahead_size;
+                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: alloc of %.2f MiB failed, trying LRU-style reclaim (target %.2f MiB) from %.2f MiB cached\n",
+                    device, look_ahead_size/1024.0/1024.0, reclaim_target/1024.0/1024.0, cached_bytes/1024.0/1024.0);
+
+                size_t freed = 0;
+                while (freed < reclaim_target) {
+                    int victim = -1;
+                    uint64_t oldest = UINT64_MAX;
+                    for (int i = 0; i < MAX_BUFFERS; ++i) {
+                        ggml_cuda_buffer & b = buffer_pool[i];
+                        if (b.ptr != nullptr && buffer_pool_ts[i] < oldest) {
+                            victim = i;
+                            oldest = buffer_pool_ts[i];
+                        }
+                    }
+                    if (victim < 0) {
+                        break;
+                    }
+                    ggml_cuda_buffer & b = buffer_pool[victim];
+                    CUDA_CHECK(cudaFree(b.ptr));
+                    freed += b.size;
+                    pool_size -= b.size;
+                    b.ptr = nullptr;
+                    b.size = 0;
+                    buffer_pool_ts[victim] = 0;
                 }
-                ggml_cuda_buffer & b = buffer_pool[victim];
-                CUDA_CHECK(cudaFree(b.ptr));
-                freed += b.size;
-                pool_size -= b.size;
-                b.ptr = nullptr;
-                b.size = 0;
-                buffer_pool_ts[victim] = 0;
-            }
-
-            err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
 
-            // terminal fallback if the 3x-bounded reclaim left cached buffers behind
-            if (err == cudaErrorMemoryAllocation && pool_size > 0) {
-                (void)cudaGetLastError();
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: reclaim bounded at %.2f MiB, flushing remaining %.2f MiB cached\n",
-                    device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
-                clear_pool();
                 err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
-            }
 
-            if (err == cudaSuccess) {
-                GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after reclaiming %.2f MiB\n",
-                    device, freed/1024.0/1024.0);
+                // terminal fallback if the 3x-bounded reclaim left cached buffers behind
+                if (err == cudaErrorMemoryAllocation && pool_size > 0) {
+                    (void)cudaGetLastError();
+                    GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: reclaim bounded at %.2f MiB, flushing remaining %.2f MiB cached\n",
+                        device, freed/1024.0/1024.0, pool_size/1024.0/1024.0);
+                    clear_pool();
+                    err = ggml_cuda_device_malloc(&ptr, look_ahead_size, device);
+                }
+
+                if (err == cudaSuccess) {
+                    GGML_LOG_DEBUG(GGML_CUDA_NAME " pool[%d]: retry succeeded after reclaiming %.2f MiB\n",
+                        device, freed/1024.0/1024.0);
+                }
             }
         }
         CUDA_CHECK(err);
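
For reference, the LRU victim scan that the non-HIP path retains from patch 2 reduces to picking the live slot with the smallest free-timestamp. A self-contained sketch with a mock pool; `Slot` and `lru_victim` are illustrative names, not the real ggml types:

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    struct Slot { void * ptr; size_t size; uint64_t ts; };  // ts stamped when freed into the pool

    // Return the index of the least recently freed live slot, or -1 if none.
    template <int N>
    int lru_victim(const Slot (&pool)[N]) {
        int victim = -1;
        uint64_t oldest = UINT64_MAX;
        for (int i = 0; i < N; ++i) {
            if (pool[i].ptr != nullptr && pool[i].ts < oldest) {
                victim = i;
                oldest = pool[i].ts;
            }
        }
        return victim;
    }

    int main() {
        int a, b;
        Slot pool[3] = {{&a, 64, 7}, {nullptr, 0, 0}, {&b, 32, 3}};
        assert(lru_victim(pool) == 2);   // ts 3 beats ts 7; the empty slot is skipped
        return 0;
    }
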