diff --git a/src/utils/videos.ts b/src/utils/videos.ts
index 2ab9c50..52d358e 100644
--- a/src/utils/videos.ts
+++ b/src/utils/videos.ts
@@ -489,6 +489,17 @@ function nf4RuntimeFootprintForRepo(repo: string | null | undefined, runtimeFoot
   return null;
 }
 
+/** When the catalog supplies a CUDA runtime override and we layer the
+ * resolution-driven attention term on top, we use 60% of the raw
+ * ``attentionPeakGb`` instead of 100%. The raw figure assumes a dense
+ * fp16 8-head slab (see ``EFFECTIVE_HEAD_SLAB_MULTIPLIER``); CUDA
+ * pipelines with override metadata in practice run with attention
+ * slicing / fp8 KV / sequence-parallel kernels that cut the resident
+ * peak by roughly that factor. Without the discount the HunyuanVideo
+ * 1280×720 × 33 frames NF4 case crosses the danger ratio even though
+ * the real run fits inside 24 GB on a 4090. */
+const CUDA_OVERRIDE_ATTENTION_DISCOUNT = 0.6;
+
 function estimateVideoRequestPeakGb(opts: {
   modelFootprintGb: number;
   attentionPeakGb: number;
@@ -500,8 +511,11 @@ function estimateVideoRequestPeakGb(opts: {
     // Catalog CUDA runtime footprints are phase peaks for pipelines with
     // offload / sequential text-encoder handling. Adding the full attention
     // estimate on top double-counts separate phases: Wan 2.2 5B peaks near
-    // 22 GB while text encoding, then drops before denoising.
-    return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + attentionPeakGb);
+    // 22 GB while text encoding, then drops before denoising. The 0.55×
+    // resident factor models the offloaded weights at denoise time;
+    // ``CUDA_OVERRIDE_ATTENTION_DISCOUNT`` accounts for attention slicing.
+    const slicedAttention = attentionPeakGb * CUDA_OVERRIDE_ATTENTION_DISCOUNT;
+    return Math.max(modelFootprintGb, modelFootprintGb * 0.55 + slicedAttention);
   }
   return modelFootprintGb + attentionPeakGb;
 }