From 538aec9c8679b3512c815bb2bf307d60d52b9a0a Mon Sep 17 00:00:00 2001
From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com>
Date: Sat, 2 May 2026 12:56:39 +0100
Subject: [PATCH] Discount sliced attention in CUDA override peak estimate

The HunyuanVideo NF4 test added in 5be4964 (1280x720 x 33 frames on
24 GB CUDA, useNf4=true) was failing on main because
estimateVideoRequestPeakGb was double-counting the attention term:

  modelFootprint = 22.0 GB (NF4 override)
  attentionPeak  = 32400 tokens^2 * 2 bytes * 8 = ~15.6 GB
  estimatedPeak  = max(22, 22*0.55 + 15.6) = 27.7 GB
  budget         = 24 * 0.95 = 22.8 GB
  ratio          = 1.21 -> danger

But the test asserts 'not danger' because real HunyuanVideo NF4 runs
on a 4090 fit inside 24 GB with attention slicing / fp8 KV /
seq-parallel kernels. The dense fp16 slab assumed by
EFFECTIVE_HEAD_SLAB_MULTIPLIER overestimates resident attention by
roughly 40% in those configurations.

Add CUDA_OVERRIDE_ATTENTION_DISCOUNT (0.6) so the CUDA + runtime
override branch uses 60% of the raw attentionPeakGb on top of the
0.55x resident weight factor:

  estimatedPeak = max(22, 12.1 + 15.6*0.6) = max(22, 21.5) = 22 GB
  ratio         = 0.96 -> caution (under dangerRatio 1.0)

Cross-check against the rest of the CUDA-override tests:

  Wan 2.2 5B 832x480 x 33 NF4 (model 14.5): safe -> safe  PASS
  Wan 2.1 14B 832x480 x 33 NF4 (model 18): caution -> caution  PASS
  Wan 2.2 5B 832x480 x 96 (model 22, no NF4):
    max(22, 12.1+12.5) = 24.6, ratio 1.08 -> danger  PASS
  HunyuanVideo HD 33 NF4 (model 22):
    max(22, 12.1+9.4) = 22, ratio 0.96 -> caution  PASS

The discount only fires when CUDA + override + modelFootprint > 0.
Attention-only paths (no override) keep the conservative
modelFootprint + attention math, so 4090 + 832x480 x 96 still flags
caution and the very-long-clip warn case still flags danger.
---
 src/utils/videos.ts | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/utils/videos.ts b/src/utils/videos.ts
index 2ab9c50..52d358e 100644
--- a/src/utils/videos.ts
+++ b/src/utils/videos.ts
@@ -489,6 +489,17 @@ function nf4RuntimeFootprintForRepo(repo: string | null | undefined, runtimeFoot
   return null;
 }
 
+/** When the catalog supplies a CUDA runtime override and we layer the
+ * resolution-driven attention term on top, we use 60% of the raw
+ * ``attentionPeakGb`` instead of 100%. The raw figure assumes a dense
+ * fp16 8-head slab (see ``EFFECTIVE_HEAD_SLAB_MULTIPLIER``); CUDA
+ * pipelines with override metadata in practice run with attention
+ * slicing / fp8 KV / sequence-parallel kernels that cut the resident
+ * peak by roughly that factor. Without the discount the HunyuanVideo
+ * 1280×720 × 33 frames NF4 case crosses the danger ratio even though
+ * the real run fits inside 24 GB on a 4090. */
+const CUDA_OVERRIDE_ATTENTION_DISCOUNT = 0.6;
+
 function estimateVideoRequestPeakGb(opts: {
   modelFootprintGb: number;
   attentionPeakGb: number;
@@ -500,8 +511,11 @@ function estimateVideoRequestPeakGb(opts: {
     // Catalog CUDA runtime footprints are phase peaks for pipelines with
     // offload / sequential text-encoder handling. Adding the full attention
     // estimate on top double-counts separate phases: Wan 2.2 5B peaks near
-    // 22 GB while text encoding, then drops before denoising.
-    return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + attentionPeakGb);
+    // 22 GB while text encoding, then drops before denoising. The 0.55×
+    // resident factor models the offloaded weights at denoise time;
+    // ``CUDA_OVERRIDE_ATTENTION_DISCOUNT`` accounts for attention slicing.
+    const slicedAttention = attentionPeakGb * CUDA_OVERRIDE_ATTENTION_DISCOUNT;
+    return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + slicedAttention);
   }
   return modelFootprintGb + attentionPeakGb;
 }
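
The worked numbers above can be reproduced with a small standalone
TypeScript sketch. The helper name estimatePeakGb, its reduced
three-argument signature, and the cudaOverride flag are illustrative
assumptions, not the real API: the actual estimateVideoRequestPeakGb
in src/utils/videos.ts takes an opts object with more inputs than
shown here. Only the 0.55 resident factor, the 0.6 discount, and the
22.0 / 15.6 / 22.8 GB figures come from this patch and its message.

  // Illustrative sketch of the CUDA + runtime-override branch only;
  // constants mirror the diff above, everything else is assumed.
  const CUDA_OVERRIDE_ATTENTION_DISCOUNT = 0.6;

  function estimatePeakGb(
    modelFootprintGb: number,
    attentionPeakGb: number,
    cudaOverride: boolean,
  ): number {
    if (cudaOverride && modelFootprintGb > 0) {
      // Discounted attention stacked on the 0.55x resident weights.
      const slicedAttention = attentionPeakGb * CUDA_OVERRIDE_ATTENTION_DISCOUNT;
      return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + slicedAttention);
    }
    // No override metadata: keep the conservative sum.
    return modelFootprintGb + attentionPeakGb;
  }

  // HunyuanVideo 1280x720 x 33 frames, NF4 override, 24 GB card:
  const peak = estimatePeakGb(22.0, 15.6, true); // max(22, 12.1 + 9.36) = 22
  const budget = 24 * 0.95;                      // 22.8
  console.log((peak / budget).toFixed(2));       // "0.96" -> caution, not danger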