From 538aec9c8679b3512c815bb2bf307d60d52b9a0a Mon Sep 17 00:00:00 2001
From: Cryptopoly <31970407+cryptopoly@users.noreply.github.com>
Date: Sat, 2 May 2026 12:56:39 +0100
Subject: [PATCH] Discount sliced attention in CUDA override peak estimate

The HunyuanVideo NF4 test added in 5be4964 (1280x720 x 33 frames on
24 GB CUDA, useNf4=true) was failing on main because
estimateVideoRequestPeakGb was double-counting the attention term:

  modelFootprint = 22.0 GB (NF4 override)
  attentionPeak  = 32400 tokens^2 * 2 bytes * 8 = ~15.6 GB
  estimatedPeak  = max(22, 22*0.55 + 15.6) = 27.7 GB
  budget         = 24 * 0.95 = 22.8 GB
  ratio          = 1.21 -> danger

But the test asserts 'not danger' because real HunyuanVideo NF4 runs
on a 4090 fit inside 24 GB with attention slicing / fp8 KV /
seq-parallel kernels. The dense fp16 slab assumed by
EFFECTIVE_HEAD_SLAB_MULTIPLIER overestimates resident attention by
roughly 40% in those configurations.

Add CUDA_OVERRIDE_ATTENTION_DISCOUNT (0.6) so the CUDA + runtime
override branch uses 60% of the raw attentionPeakGb on top of the
0.55x resident weight factor:

  estimatedPeak = max(22, 12.1 + 15.6*0.6) = max(22, 21.5) = 22 GB
  ratio         = 0.96 -> caution (under dangerRatio 1.0)

Cross-check against the rest of the CUDA-override tests:

  Wan 2.2 5B 832x480 x 33 NF4 (model 14.5): safe -> safe  PASS
  Wan 2.1 14B 832x480 x 33 NF4 (model 18): caution -> caution  PASS
  Wan 2.2 5B 832x480 x 96 (model 22, no NF4):
    max(22, 12.1+12.5) = 24.6, ratio 1.08 -> danger  PASS
  HunyuanVideo HD 33 NF4 (model 22):
    max(22, 12.1+9.4) = 22, ratio 0.96 -> caution  PASS

The discount only fires when CUDA + override + modelFootprint > 0.
Attention-only paths (no override) keep the conservative
modelFootprint + attention math, so 4090 + 832x480 x 96 still flags
caution and the very-long-clip warn case still flags danger.
---
 src/utils/videos.ts | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/src/utils/videos.ts b/src/utils/videos.ts
index 2ab9c50..52d358e 100644
--- a/src/utils/videos.ts
+++ b/src/utils/videos.ts
@@ -489,6 +489,17 @@ function nf4RuntimeFootprintForRepo(repo: string | null | undefined, runtimeFoot
   return null;
 }
 
+/** When the catalog supplies a CUDA runtime override and we layer the
+ * resolution-driven attention term on top, we use 60% of the raw
+ * ``attentionPeakGb`` instead of 100%. The raw figure assumes a dense
+ * fp16 8-head slab (see ``EFFECTIVE_HEAD_SLAB_MULTIPLIER``); CUDA
+ * pipelines with override metadata in practice run with attention
+ * slicing / fp8 KV / sequence-parallel kernels that cut the resident
+ * peak by roughly that factor. Without the discount the HunyuanVideo
+ * 1280×720 × 33 frames NF4 case crosses the danger ratio even though
+ * the real run fits inside 24 GB on a 4090. */
+const CUDA_OVERRIDE_ATTENTION_DISCOUNT = 0.6;
+
 function estimateVideoRequestPeakGb(opts: {
   modelFootprintGb: number;
   attentionPeakGb: number;
@@ -500,8 +511,11 @@ function estimateVideoRequestPeakGb(opts: {
     // Catalog CUDA runtime footprints are phase peaks for pipelines with
     // offload / sequential text-encoder handling. Adding the full attention
     // estimate on top double-counts separate phases: Wan 2.2 5B peaks near
-    // 22 GB while text encoding, then drops before denoising.
-    return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + attentionPeakGb);
+    // 22 GB while text encoding, then drops before denoising. The 0.55×
+    // resident factor models the offloaded weights at denoise time;
+    // ``CUDA_OVERRIDE_ATTENTION_DISCOUNT`` accounts for attention slicing.
+    const slicedAttention = attentionPeakGb * CUDA_OVERRIDE_ATTENTION_DISCOUNT;
+    return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + slicedAttention);
   }
   return modelFootprintGb + attentionPeakGb;
 }
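
The worked numbers above can be reproduced with a small standalone
TypeScript sketch. The helper name estimatePeakGb, its reduced
three-argument signature, and the cudaOverride flag are illustrative
assumptions, not the real API: the actual estimateVideoRequestPeakGb
in src/utils/videos.ts takes an opts object with more inputs than
shown here. Only the 0.55 resident factor, the 0.6 discount, and the
22.0 / 15.6 / 22.8 GB figures come from this patch and its message.

  // Illustrative sketch of the CUDA + runtime-override branch only;
  // constants mirror the diff above, everything else is assumed.
  const CUDA_OVERRIDE_ATTENTION_DISCOUNT = 0.6;

  function estimatePeakGb(
    modelFootprintGb: number,
    attentionPeakGb: number,
    cudaOverride: boolean,
  ): number {
    if (cudaOverride && modelFootprintGb > 0) {
      // Discounted attention stacked on the 0.55x resident weights.
      const slicedAttention = attentionPeakGb * CUDA_OVERRIDE_ATTENTION_DISCOUNT;
      return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + slicedAttention);
    }
    // No override metadata: keep the conservative sum.
    return modelFootprintGb + attentionPeakGb;
  }

  // HunyuanVideo 1280x720 x 33 frames, NF4 override, 24 GB card:
  const peak = estimatePeakGb(22.0, 15.6, true); // max(22, 12.1 + 9.36) = 22
  const budget = 24 * 0.95;                      // 22.8
  console.log((peak / budget).toFixed(2));       // "0.96" -> caution, not danger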