From 1d8ac33bdbdf92d2e63a8a7e824141eca9ef3250 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Thu, 2 Apr 2026 06:02:21 +0000 Subject: [PATCH 01/21] Add the Skip softmax diffusion Signed-off-by: Jingyu Xin --- .../diffusers/sparsity/ltx2_skip_softmax.py | 397 ++++++++++++++++++ .../diffusers/sparsity/wan22_skip_softmax.py | 268 ++++++++++++ .../calibration/calibrate.py | 25 +- .../sparsity/attention_sparsity/conversion.py | 89 +++- .../attention_sparsity/kernels/__init__.py | 50 ++- .../kernels/diffusers_eager_attention.py | 147 +++++++ .../kernels/diffusers_triton_attention.py | 160 +++++++ .../kernels/ltx_eager_attention.py | 114 +++++ .../kernels/ltx_triton_attention.py | 148 +++++++ .../methods/flash_skip_softmax.py | 23 +- .../methods/triton_skip_softmax.py | 14 + .../attention_sparsity/plugins/huggingface.py | 9 + .../attention_sparsity/stats_manager.py | 17 +- 13 files changed, 1429 insertions(+), 32 deletions(-) create mode 100644 examples/diffusers/sparsity/ltx2_skip_softmax.py create mode 100644 examples/diffusers/sparsity/wan22_skip_softmax.py create mode 100644 modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py create mode 100644 modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py create mode 100644 modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py create mode 100644 modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py diff --git a/examples/diffusers/sparsity/ltx2_skip_softmax.py b/examples/diffusers/sparsity/ltx2_skip_softmax.py new file mode 100644 index 0000000000..dae064e070 --- /dev/null +++ b/examples/diffusers/sparsity/ltx2_skip_softmax.py @@ -0,0 +1,397 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""LTX-2 inference with skip-softmax sparse attention. + +This example applies skip-softmax sparse attention to the LTX-2 video +generation model using exponential model calibration +(``scale_factor = a * exp(b * target_sparsity)``). + +During calibration, ``flash_skip_softmax`` with the eager attention backend +collects sparsity statistics across multiple threshold trials. The fitted +exponential model then allows runtime control of the target sparsity ratio +without recalibration. + +Only the stage-1 backbone is sparsified. Stage 2 (spatial upsampler + +distilled LoRA) runs unmodified. 
+ +Usage:: + + # With calibration (recommended) + python ltx2_skip_softmax.py --prompt "A cat playing piano" --output out.mp4 \\ + --calibrate --target-sparsity 0.25 + + # Disable sparsity on first/last 2 layers (higher quality, less speedup) + python ltx2_skip_softmax.py --prompt "A cat playing piano" --output out.mp4 \\ + --calibrate --target-sparsity 0.25 --skip-first-last 2 +""" + +import argparse +import functools +import os + +import torch +from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps +from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number +from ltx_pipelines.ti2vid_two_stages import TI2VidTwoStagesPipeline +from ltx_pipelines.utils.constants import ( + AUDIO_SAMPLE_RATE, + DEFAULT_2_STAGE_HEIGHT, + DEFAULT_2_STAGE_WIDTH, + DEFAULT_AUDIO_GUIDER_PARAMS, + DEFAULT_FRAME_RATE, + DEFAULT_NEGATIVE_PROMPT, + DEFAULT_NUM_INFERENCE_STEPS, + DEFAULT_SEED, + DEFAULT_VIDEO_GUIDER_PARAMS, +) +from ltx_pipelines.utils.media_io import encode_video + +import modelopt.torch.sparsity.attention_sparsity as mtsa +from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule + +# ---- Model paths (edit these or override via environment variables) ---- +CHECKPOINT_PATH = os.environ.get( + "LTX2_CHECKPOINT", + "/home/scratch.omniml_data_2/jingyux/models/LTX-2/ltx-2-19b-dev.safetensors", +) +DISTILLED_LORA_PATH = os.environ.get( + "LTX2_DISTILLED_LORA", + "/home/scratch.omniml_data_2/jingyux/models/LTX-2/ltx-2-19b-distilled-lora-384.safetensors", +) +SPATIAL_UPSAMPLER_PATH = os.environ.get( + "LTX2_SPATIAL_UPSAMPLER", + "/home/scratch.omniml_data_2/jingyux/models/LTX-2/ltx-2-spatial-upscaler-x2-1.0.safetensors", +) +GEMMA_ROOT = os.environ.get( + "LTX2_GEMMA_ROOT", + "/home/scratch.omniml_data_2/jingyux/models/LTX-2/gemma-3-12b-it-qat-q4_0-unquantized", +) + +DEFAULT_NUM_FRAMES = 121 +NUM_TRANSFORMER_BLOCKS = 48 + +# Default threshold trials for calibration +DEFAULT_THRESHOLD_TRIALS = [ + 
1e-6, + 5e-6, + 1e-5, + 5e-5, + 1e-4, + 5e-4, + 1e-3, + 5e-3, + 1e-2, + 2e-2, + 5e-2, + 1e-1, + 2e-1, + 3e-1, + 5e-1, + 7e-1, +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="LTX-2 video generation with skip-softmax sparse attention" + ) + parser.add_argument("--prompt", type=str, default=None, help="Text prompt for generation") + parser.add_argument( + "--prompt-dir", + type=str, + default=None, + help="Directory of .txt prompt files (one prompt per file). Overrides --prompt.", + ) + parser.add_argument("--output", type=str, default="output.mp4", help="Output video path") + parser.add_argument( + "--output-dir", + type=str, + default=None, + help="Directory to save videos when using --prompt-dir", + ) + parser.add_argument( + "--num-frames", type=int, default=DEFAULT_NUM_FRAMES, help="Number of frames" + ) + parser.add_argument("--height", type=int, default=DEFAULT_2_STAGE_HEIGHT, help="Video height") + parser.add_argument("--width", type=int, default=DEFAULT_2_STAGE_WIDTH, help="Video width") + parser.add_argument("--seed", type=int, default=DEFAULT_SEED, help="Random seed") + + # Sparse attention options + parser.add_argument( + "--skip-first-last", + type=int, + default=0, + help="Number of first/last transformer layers to keep dense (default: 0)", + ) + + # Calibration options + parser.add_argument( + "--calibrate", + action="store_true", + help="Calibrate threshold via exponential model (recommended)", + ) + parser.add_argument( + "--target-sparsity", + type=float, + default=0.25, + help="Target sparsity ratio for calibration (0.0-1.0)", + ) + parser.add_argument( + "--calib-steps", + type=int, + default=10, + help="Inference steps per calibration sample", + ) + parser.add_argument( + "--calib-frames", + type=int, + default=81, + help="Number of frames per calibration sample", + ) + parser.add_argument( + "--calib-size", + type=int, + default=1, + help="Number of prompts to use for calibration", + ) + return 
parser.parse_args() + + +def _patch_vae_requires_grad(pipeline: TI2VidTwoStagesPipeline): + """Ensure VAE decoder weights have requires_grad=False to avoid autograd issues.""" + for ledger_attr in ("stage_1_model_ledger", "stage_2_model_ledger"): + ledger = getattr(pipeline, ledger_attr, None) + if ledger is None: + continue + for loader_name in ("video_decoder", "audio_decoder"): + orig_loader = getattr(ledger, loader_name, None) + if orig_loader is None: + continue + + def _make_patched(fn): + @functools.wraps(fn) + def patched(): + model = fn() + model.requires_grad_(False) + return model + + return patched + + setattr(ledger, loader_name, _make_patched(orig_loader)) + + +def build_pipeline() -> TI2VidTwoStagesPipeline: + """Build the LTX-2 two-stage video generation pipeline.""" + pipeline = TI2VidTwoStagesPipeline( + checkpoint_path=CHECKPOINT_PATH, + distilled_lora=[ + LoraPathStrengthAndSDOps(DISTILLED_LORA_PATH, 0.8, LTXV_LORA_COMFY_RENAMING_MAP) + ], + spatial_upsampler_path=SPATIAL_UPSAMPLER_PATH, + gemma_root=GEMMA_ROOT, + loras=[], + ) + _patch_vae_requires_grad(pipeline) + return pipeline + + +def build_sparse_config(args: argparse.Namespace) -> dict: + """Build sparse attention config from CLI args. + + Uses flash_skip_softmax which supports both calibration (eager attention + with F.softmax patching) and inference. Calibration fits an exponential + model: scale_factor = a * exp(b * sparsity). 
+ """ + attn_cfg: dict = { + "method": "flash_skip_softmax", + "thresholds": {"prefill": [1e-3]}, + "br": 128, + "bc": 128, + "backend": "pytorch", + "is_causal": False, # Diffusion = bidirectional attention + "collect_stats": True, + "enable": True, + } + + sparse_cfg: dict = { + "*.attn1": attn_cfg, # Self-attention only + # Disable on all cross-attention and cross-modal attention + "*.attn2": {"enable": False}, + "*audio_attn1*": {"enable": False}, + "*audio_attn2*": {"enable": False}, + "*audio_to_video_attn*": {"enable": False}, + "*video_to_audio_attn*": {"enable": False}, + "default": {"enable": False}, + } + + # Keep first/last N layers dense for quality + for i in range(args.skip_first_last): + sparse_cfg[f"*transformer_blocks.{i}.attn*"] = {"enable": False} + sparse_cfg[f"*transformer_blocks.{NUM_TRANSFORMER_BLOCKS - 1 - i}.attn*"] = { + "enable": False + } + + config: dict = {"sparse_cfg": sparse_cfg} + + # Add calibration config with threshold trials + if args.calibrate: + sparse_cfg["calibration"] = { + "target_sparse_ratio": {"prefill": args.target_sparsity}, + "samples": args.calib_size, + "threshold_trials": DEFAULT_THRESHOLD_TRIALS, + } + + return config + + +def load_calib_prompts(calib_size: int) -> list[str]: + """Load calibration prompts from OpenVid-1M dataset.""" + from datasets import load_dataset + + dataset = load_dataset("nkp37/OpenVid-1M") + prompts = list(dataset["train"]["caption"][:calib_size]) + print(f"Loaded {len(prompts)} calibration prompts from OpenVid-1M") + return prompts + + +def build_calibration_forward_loop( + pipeline: TI2VidTwoStagesPipeline, + num_steps: int = 10, + num_frames: int = 81, + calib_size: int = 1, +): + """Build a forward loop for exponential model calibration. + + Generates short videos to exercise the attention mechanism at various + threshold trials, collecting sparsity statistics for the exponential fit. 
+ """ + calib_prompts = load_calib_prompts(calib_size) + tiling_config = TilingConfig.default() + + def forward_loop(model): + for i, prompt in enumerate(calib_prompts): + print(f"Calibration [{i + 1}/{len(calib_prompts)}]: {prompt[:60]}...") + pipeline( + prompt=prompt, + negative_prompt=DEFAULT_NEGATIVE_PROMPT, + seed=DEFAULT_SEED, + height=DEFAULT_2_STAGE_HEIGHT, + width=DEFAULT_2_STAGE_WIDTH, + num_frames=num_frames, + frame_rate=DEFAULT_FRAME_RATE, + num_inference_steps=num_steps, + video_guider_params=DEFAULT_VIDEO_GUIDER_PARAMS, + audio_guider_params=DEFAULT_AUDIO_GUIDER_PARAMS, + images=[], + tiling_config=tiling_config, + ) + + return forward_loop + + +def print_sparsity_summary(transformer: torch.nn.Module) -> None: + """Print per-module sparsity statistics.""" + enabled, disabled = [], [] + for name, module in transformer.named_modules(): + if isinstance(module, SparseAttentionModule): + if module.is_enabled: + enabled.append((name, module)) + else: + disabled.append(name) + + print(f"\nSparse attention: {len(enabled)} enabled, {len(disabled)} disabled") + for name, module in enabled: + info = module.get_threshold_info() + print(f" {name}: {info}") + + +def main() -> None: + args = parse_args() + + # ---- Build pipeline ---- + print("Building LTX-2 pipeline...") + pipeline = build_pipeline() + + # ---- Get and sparsify the stage-1 transformer ---- + transformer = pipeline.stage_1_model_ledger.transformer() + # Pin transformer in memory so pipeline reuses the sparsified version + pipeline.stage_1_model_ledger.transformer = lambda: transformer + + config = build_sparse_config(args) + forward_loop = None + if args.calibrate: + forward_loop = build_calibration_forward_loop( + pipeline, + num_steps=args.calib_steps, + num_frames=args.calib_frames, + calib_size=args.calib_size, + ) + + print("Applying skip-softmax sparse attention...") + mtsa.sparsify(transformer, config, forward_loop=forward_loop) + + # ---- Build prompt list ---- + prompts_and_outputs: 
list[tuple[str, str]] = [] + if args.prompt_dir: + output_dir = args.output_dir or "output_videos" + os.makedirs(output_dir, exist_ok=True) + prompt_files = sorted(f for f in os.listdir(args.prompt_dir) if f.endswith(".txt")) + for pf in prompt_files: + with open(os.path.join(args.prompt_dir, pf)) as f: + prompt = f.read().strip() + stem = os.path.splitext(pf)[0] + prompts_and_outputs.append((prompt, os.path.join(output_dir, f"{stem}.mp4"))) + elif args.prompt: + prompts_and_outputs.append((args.prompt, args.output)) + else: + raise ValueError("Either --prompt or --prompt-dir must be provided") + + # ---- Generate ---- + tiling_config = TilingConfig.default() + for i, (prompt, output_path) in enumerate(prompts_and_outputs): + print(f"\nGenerating [{i + 1}/{len(prompts_and_outputs)}]: {prompt[:80]}...") + + video, audio = pipeline( + prompt=prompt, + negative_prompt=DEFAULT_NEGATIVE_PROMPT, + seed=args.seed, + height=args.height, + width=args.width, + num_frames=args.num_frames, + frame_rate=DEFAULT_FRAME_RATE, + num_inference_steps=DEFAULT_NUM_INFERENCE_STEPS, + video_guider_params=DEFAULT_VIDEO_GUIDER_PARAMS, + audio_guider_params=DEFAULT_AUDIO_GUIDER_PARAMS, + images=[], + tiling_config=tiling_config, + ) + + encode_video( + video=video, + fps=DEFAULT_FRAME_RATE, + audio=audio, + audio_sample_rate=AUDIO_SAMPLE_RATE, + output_path=output_path, + video_chunks_number=get_video_chunks_number(args.num_frames, tiling_config), + ) + print(f"Saved to {output_path}") + + # ---- Print stats ---- + print_sparsity_summary(transformer) + + +if __name__ == "__main__": + main() diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py new file mode 100644 index 0000000000..ac2031a6d7 --- /dev/null +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -0,0 +1,268 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Wan 2.2 inference with skip-softmax sparse attention. + +This example applies skip-softmax sparse attention to the Wan 2.2 video +generation model (text-to-video) using exponential model calibration +(``scale_factor = a * exp(b * target_sparsity)``). + +During calibration, ``flash_skip_softmax`` with the eager attention backend +collects sparsity statistics across multiple threshold trials. The fitted +exponential model then allows runtime control of the target sparsity ratio +without recalibration. + +The Wan 2.2 5B model has 40 transformer blocks with self-attention (attn1) +and cross-attention (attn2). Only self-attention is sparsified. 
+ +Usage:: + + # With calibration (recommended) + python wan22_skip_softmax.py --prompt "A cat playing piano" --output out.mp4 \\ + --calibrate --target-sparsity 0.25 + + # Custom model path + python wan22_skip_softmax.py --model-path /path/to/Wan2.2-T2V-5B \\ + --prompt "A sunset over mountains" --output sunset.mp4 --calibrate +""" + +import argparse +import os + +import torch +from diffusers import AutoencoderKLWan, WanPipeline +from diffusers.utils import export_to_video + +import modelopt.torch.sparsity.attention_sparsity as mtsa +from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule + +DEFAULT_MODEL_PATH = os.environ.get("WAN22_MODEL_PATH", "Wan-AI/Wan2.2-T2V-5B") +NUM_TRANSFORMER_BLOCKS = 40 + +# Default threshold trials for calibration +DEFAULT_THRESHOLD_TRIALS = [ + 1e-6, + 5e-6, + 1e-5, + 5e-5, + 1e-4, + 5e-4, + 1e-3, + 5e-3, + 1e-2, + 2e-2, + 5e-2, + 1e-1, + 2e-1, + 3e-1, + 5e-1, + 7e-1, +] + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser( + description="Wan 2.2 video generation with skip-softmax sparse attention" + ) + parser.add_argument("--prompt", type=str, required=True, help="Text prompt for generation") + parser.add_argument("--output", type=str, default="output.mp4", help="Output video path") + parser.add_argument( + "--model-path", type=str, default=DEFAULT_MODEL_PATH, help="Wan 2.2 model path or HF ID" + ) + parser.add_argument( + "--num-frames", type=int, default=81, help="Number of frames (must be 4k+1)" + ) + parser.add_argument("--height", type=int, default=480, help="Video height") + parser.add_argument("--width", type=int, default=832, help="Video width") + parser.add_argument("--num-steps", type=int, default=50, help="Number of inference steps") + parser.add_argument( + "--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale" + ) + parser.add_argument("--seed", type=int, default=42, help="Random seed") + + # Sparse attention options + 
parser.add_argument( + "--skip-first-last", + type=int, + default=0, + help="Number of first/last transformer layers to keep dense (default: 0)", + ) + + # Calibration options + parser.add_argument( + "--calibrate", + action="store_true", + help="Calibrate threshold via exponential model (recommended)", + ) + parser.add_argument( + "--target-sparsity", + type=float, + default=0.25, + help="Target sparsity ratio for calibration (0.0-1.0)", + ) + parser.add_argument( + "--calib-steps", + type=int, + default=10, + help="Inference steps for calibration", + ) + parser.add_argument( + "--calib-frames", + type=int, + default=33, + help="Number of frames for calibration (fewer = faster)", + ) + return parser.parse_args() + + +def build_pipeline(model_path: str) -> WanPipeline: + """Build the Wan 2.2 text-to-video pipeline.""" + vae = AutoencoderKLWan.from_pretrained(model_path, subfolder="vae", torch_dtype=torch.float32) + pipe = WanPipeline.from_pretrained(model_path, vae=vae, torch_dtype=torch.bfloat16) + pipe.to("cuda") + return pipe + + +def build_sparse_config(args: argparse.Namespace) -> dict: + """Build sparse attention config from CLI args. + + Uses flash_skip_softmax which supports both calibration (eager attention + with F.softmax patching) and inference. Calibration fits an exponential + model: scale_factor = a * exp(b * sparsity). 
+ """ + attn_cfg: dict = { + "method": "flash_skip_softmax", + "thresholds": {"prefill": [1e-3]}, + "br": 128, + "bc": 128, + "backend": "pytorch", + "is_causal": False, # Diffusion = bidirectional attention + "collect_stats": True, + "enable": True, + } + + sparse_cfg: dict = { + "*.attn1*": attn_cfg, # Self-attention only + "*.attn2*": {"enable": False}, # Text cross-attention + "default": {"enable": False}, + } + + # Keep first/last N layers dense for quality + for i in range(args.skip_first_last): + sparse_cfg[f"*blocks.{i}.attn*"] = {"enable": False} + sparse_cfg[f"*blocks.{NUM_TRANSFORMER_BLOCKS - 1 - i}.attn*"] = {"enable": False} + + config: dict = {"sparse_cfg": sparse_cfg} + + # Add calibration config with threshold trials + if args.calibrate: + sparse_cfg["calibration"] = { + "target_sparse_ratio": {"prefill": args.target_sparsity}, + "samples": 1, + "threshold_trials": DEFAULT_THRESHOLD_TRIALS, + } + + return config + + +def build_calibration_forward_loop( + pipe: WanPipeline, + prompt: str, + num_steps: int = 10, + num_frames: int = 33, + height: int = 480, + width: int = 832, + seed: int = 42, +): + """Build a forward loop for exponential model calibration.""" + + def forward_loop(model): + print(f"Calibration: generating {num_frames} frames @ {height}x{width}...") + pipe( + prompt=prompt, + num_frames=num_frames, + height=height, + width=width, + num_inference_steps=num_steps, + guidance_scale=5.0, + generator=torch.Generator(device="cuda").manual_seed(seed), + ) + + return forward_loop + + +def print_sparsity_summary(model: torch.nn.Module) -> None: + """Print per-module sparsity statistics.""" + enabled, disabled = [], [] + for name, module in model.named_modules(): + if isinstance(module, SparseAttentionModule): + if module.is_enabled: + enabled.append((name, module)) + else: + disabled.append(name) + + print(f"\nSparse attention: {len(enabled)} enabled, {len(disabled)} disabled") + for name, module in enabled: + info = module.get_threshold_info() 
+ print(f" {name}: {info}") + + +def main() -> None: + args = parse_args() + + # ---- Build pipeline ---- + print(f"Loading Wan 2.2 from {args.model_path}...") + pipe = build_pipeline(args.model_path) + + # ---- Get and sparsify the transformer ---- + transformer = pipe.transformer + + config = build_sparse_config(args) + forward_loop = None + if args.calibrate: + forward_loop = build_calibration_forward_loop( + pipe, + prompt=args.prompt, + num_steps=args.calib_steps, + num_frames=args.calib_frames, + height=args.height, + width=args.width, + seed=args.seed, + ) + + print("Applying skip-softmax sparse attention...") + mtsa.sparsify(transformer, config, forward_loop=forward_loop) + + # ---- Generate ---- + print(f"Generating: {args.prompt[:80]}...") + output = pipe( + prompt=args.prompt, + num_frames=args.num_frames, + height=args.height, + width=args.width, + num_inference_steps=args.num_steps, + guidance_scale=args.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed(args.seed), + ) + + export_to_video(output.frames[0], args.output, fps=16) + print(f"Saved to {args.output}") + + # ---- Print stats ---- + print_sparsity_summary(transformer) + + +if __name__ == "__main__": + main() diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py index dbc4d5bc27..da64e87d64 100644 --- a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py +++ b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py @@ -255,11 +255,14 @@ def calibrate_sparse_attention( print(f"Calibrating {len(sparse_modules)} sparse attention modules together...") - # Extract tokenizer and build calibration data if needed - tokenizer = _extract_tokenizer_from_model(model) + # Extract tokenizer and build calibration data only if no forward_loop is provided. + # When the user supplies their own forward_loop (e.g. 
for diffusion models), + # RULER dataset generation is skipped entirely. + tokenizer = None calibration_data = None - if calibrate_prefill or calibrate_decode: + if forward_loop is None and (calibrate_prefill or calibrate_decode): + tokenizer = _extract_tokenizer_from_model(model) builder = RulerDatasetBuilder( samples=calib_config.samples, max_seqlen=calib_config.max_seqlen, @@ -280,11 +283,15 @@ def calibrate_sparse_attention( print("PREFILL PHASE CALIBRATION") print("=" * 60) - if calibration_data is None: + if forward_loop is None and calibration_data is None: raise RuntimeError("calibration_data must be built before prefill") - prefill_forward_loop = forward_loop or create_calibration_forward_loop( - calibration_data, tokenizer, chunk_size=calib_config.chunk_size - ) + if forward_loop is not None: + prefill_forward_loop = forward_loop + else: + assert calibration_data is not None and tokenizer is not None + prefill_forward_loop = create_calibration_forward_loop( + calibration_data, tokenizer, chunk_size=calib_config.chunk_size + ) prefill_calibrator = DynamicThresholdCalibrator( threshold_trials=calib_config.threshold_trials, @@ -302,8 +309,8 @@ def calibrate_sparse_attention( print("DECODE PHASE CALIBRATION") print("=" * 60) - if calibration_data is None: - raise RuntimeError("calibration_data must be built before decode") + if calibration_data is None or tokenizer is None: + raise RuntimeError("calibration_data and tokenizer must be built before decode") decode_forward_loop = create_decode_calibration_forward_loop( calibration_data, tokenizer, num_decode_tokens=calib_config.num_decode_tokens ) diff --git a/modelopt/torch/sparsity/attention_sparsity/conversion.py b/modelopt/torch/sparsity/attention_sparsity/conversion.py index cdc2aed948..8bd6a9b96a 100644 --- a/modelopt/torch/sparsity/attention_sparsity/conversion.py +++ b/modelopt/torch/sparsity/attention_sparsity/conversion.py @@ -101,6 +101,42 @@ def is_attn_sparsified(model: nn.Module) -> bool: return 
any(isinstance(module, SparseAttentionModule) for module in model.modules()) +def _register_diffusers_backends_if_needed(model: nn.Module) -> None: + """Register diffusers/LTX attention backends if the model needs them. + + Called before plugin registration so that the backends are available + when ``SparseAttentionModule.forward()`` activates the skip-softmax context. + """ + # Register the diffusers eager and Triton backends if the model is a diffusers ModelMixin + try: + from diffusers.models.modeling_utils import ModelMixin + + if isinstance(model, ModelMixin): + from .kernels import ( + register_diffusers_eager_attention, + register_diffusers_triton_attention, + ) + + if register_diffusers_eager_attention is not None: + register_diffusers_eager_attention() + if register_diffusers_triton_attention is not None: + register_diffusers_triton_attention() + except ImportError: + pass + + # Patch ltx_core Attention modules if present (independent of diffusers) + import contextlib + + from .kernels import register_ltx_eager_attention, register_ltx_triton_attention + + if register_ltx_eager_attention is not None: + with contextlib.suppress(Exception): + register_ltx_eager_attention(model) + if register_ltx_triton_attention is not None: + with contextlib.suppress(Exception): + register_ltx_triton_attention(model) + + def convert_to_sparse_attention_model( model: ModelLikeModule, config: SparseAttentionConfig ) -> ConvertReturnType: @@ -116,6 +152,9 @@ def convert_to_sparse_attention_model( # Initialize the true module if necessary model = model.init_modellike() if isinstance(model, ModelLikeModule) else model + # Register diffusers backends for diffusion models + _register_diffusers_backends_if_needed(model) + # Set the correct attn_implementation for the chosen backend _set_attn_implementation(model, config) @@ -346,32 +385,46 @@ def export_sparse_attention_config(model: nn.Module) -> dict[str, Any] | None: if calibration_params is None: return None - # Build 
threshold_scale_factor with model parameters - threshold_scale_factor: dict[str, Any] = { - "formula": "a * exp(b * target_sparsity)", - } - for phase in ["prefill", "decode"]: - if phase in calibration_params: - threshold_scale_factor[phase] = { - "a": calibration_params[phase]["a"], - "b": calibration_params[phase]["b"], - } + # Detect calibration type from params + sample_params = next(iter(calibration_params.values())) + is_percentile = "threshold" in sample_params # Build the export config export_config: dict[str, Any] = { "config_groups": { "group_0": { - "sparse_algo": "softmax_skip", + "sparse_algo": "softmax_skip_diffusion" if is_percentile else "softmax_skip", "targets": sorted(target_classes) if target_classes else ["Attention"], } }, - "threshold_scale_factor": threshold_scale_factor, "producer": { "name": "modelopt", "version": mo_version, }, } + if is_percentile: + threshold_config: dict[str, Any] = { + "formula": "skip if gap >= threshold * log(seq_k)", + } + for phase in ["prefill", "decode"]: + if phase in calibration_params: + threshold_config[phase] = { + "threshold": calibration_params[phase]["threshold"], + } + export_config["threshold_config"] = threshold_config + else: + threshold_scale_factor: dict[str, Any] = { + "formula": "a * exp(b * target_sparsity)", + } + for phase in ["prefill", "decode"]: + if phase in calibration_params: + threshold_scale_factor[phase] = { + "a": calibration_params[phase]["a"], + "b": calibration_params[phase]["b"], + } + export_config["threshold_scale_factor"] = threshold_scale_factor + return export_config @@ -443,6 +496,16 @@ def _format_threshold(info: dict) -> str: s = target.get(phase, 0.5) parts.append(f"{phase}: a={a:.4f}, b={b:.2f}, target={s:.0%}") return f"calibrated({', '.join(parts)})" + if t == "dynamic_calibrated_percentile": + params = info.get("calibration_params", {}) + target = info.get("target_sparse_ratio", {}) + parts = [] + for phase in ["prefill", "decode"]: + if phase in params and 
"threshold" in params[phase]: + th = params[phase]["threshold"] + s = target.get(phase, 0.2) + parts.append(f"{phase}: threshold={th:.4f}, target={s:.0%}") + return f"percentile({', '.join(parts)})" if t == "static": v = info.get("value") if isinstance(v, dict): @@ -470,6 +533,8 @@ def print_sparse_attention_summary(model: nn.Module): # Group by (method, threshold) groups: dict[tuple[str, str], int] = {} for _, module in sparse_modules: + if not module.is_enabled: + continue method = getattr(module, "_method", "unknown") threshold = _format_threshold(module.get_threshold_info()) groups[(method, threshold)] = groups.get((method, threshold), 0) + 1 diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py index dee1bc472a..bb7e921a13 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py @@ -13,12 +13,60 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Re-exports from modelopt.torch.kernels for backward compatibility.""" +"""Kernel integrations for sparse attention: Triton FA and diffusers backends.""" +import contextlib +import threading + +# --------------------------------------------------------------------------- +# Triton FA kernel re-exports (for HuggingFace LLM integration) +# --------------------------------------------------------------------------- from modelopt.torch.kernels import IS_AVAILABLE, attention, register_triton_attention +# --------------------------------------------------------------------------- +# Thread-local context: shared by diffusers eager and Triton backends +# --------------------------------------------------------------------------- +_thread_local = threading.local() + + +def set_skip_softmax_context(active: bool) -> None: + """Set thread-local flag indicating skip-softmax eager attention is active.""" + _thread_local.skip_softmax_active = active + + +def get_skip_softmax_context() -> bool: + """Return True if skip-softmax eager attention is active in this thread.""" + return getattr(_thread_local, "skip_softmax_active", False) + + +# --------------------------------------------------------------------------- +# Optional backend registrations (depend on diffusers / ltx_core) +# --------------------------------------------------------------------------- +register_diffusers_eager_attention = None +register_diffusers_triton_attention = None +register_ltx_eager_attention = None +register_ltx_triton_attention = None + +with contextlib.suppress(ImportError): + from .diffusers_eager_attention import register_diffusers_eager_attention + +with contextlib.suppress(ImportError): + from .diffusers_triton_attention import register_diffusers_triton_attention + +with contextlib.suppress(ImportError): + from .ltx_eager_attention import register_ltx_eager_attention + +with contextlib.suppress(ImportError): + from .ltx_triton_attention import register_ltx_triton_attention + __all__ = [ 
"IS_AVAILABLE", "attention", + "get_skip_softmax_context", + "register_diffusers_eager_attention", + "register_diffusers_triton_attention", + "register_ltx_eager_attention", + "register_ltx_triton_attention", "register_triton_attention", + "set_skip_softmax_context", ] diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py new file mode 100644 index 0000000000..16dd895f27 --- /dev/null +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py @@ -0,0 +1,147 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Eager attention backend for diffusers skip-softmax sparse attention. + +Registers a ``modelopt_skip_softmax`` backend in diffusers' +``_AttentionBackendRegistry`` that computes attention eagerly with an explicit +``F.softmax`` call. This allows the existing softmax-patching mechanism in +``SparseAttentionModule`` to intercept and apply block-wise sparsity. + +Used during **calibration only** — inference uses the Triton FA kernel. 
+""" + +import inspect +import math + +import torch +import torch.nn.functional as F +from diffusers.models.attention_dispatch import ( + AttentionBackendName, + _AttentionBackendRegistry, + attention_backend, +) + +_BACKEND_NAME = "modelopt_skip_softmax" +_BACKEND_REGISTERED = False + + +# --------------------------------------------------------------------------- +# Eager attention implementation +# --------------------------------------------------------------------------- + + +def _diffusers_eager_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: torch.Tensor | None = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: float | None = None, + enable_gqa: bool = False, +) -> torch.Tensor: + """Compute attention eagerly on diffusers layout ``[B, S, H, D]``. + + The explicit ``F.softmax`` call is what the skip-softmax patch intercepts. + """ + # Diffusers convention: [B, S, H, D] → transpose to [B, H, S, D] + query = query.transpose(1, 2) + key = key.transpose(1, 2) + value = value.transpose(1, 2) + + # Handle GQA: repeat K/V heads to match Q heads + if enable_gqa and query.shape[1] != key.shape[1]: + num_heads_q = query.shape[1] + num_heads_kv = key.shape[1] + n_rep = num_heads_q // num_heads_kv + key = key.repeat_interleave(n_rep, dim=1) + value = value.repeat_interleave(n_rep, dim=1) + + if scale is None: + scale = 1.0 / math.sqrt(query.shape[-1]) + + # Q @ K^T * scale + scores = torch.matmul(query, key.transpose(-2, -1)) * scale + + # Apply attention mask if provided + if attn_mask is not None: + scores = scores + attn_mask + + # Apply causal mask if needed + if is_causal: + seq_q, seq_k = scores.shape[-2], scores.shape[-1] + causal_mask = torch.triu( + torch.full((seq_q, seq_k), float("-inf"), device=scores.device, dtype=scores.dtype), + diagonal=seq_k - seq_q + 1, + ) + scores = scores + causal_mask + + # F.softmax — this is where the skip-softmax patch intercepts + scores = F.softmax(scores, dim=-1) + 
+ if dropout_p > 0.0: + scores = F.dropout(scores, p=dropout_p, training=True) + + # scores @ V + out = torch.matmul(scores, value) + + # Transpose back: [B, H, S, D] → [B, S, H, D] + out = out.transpose(1, 2) + return out + + +# --------------------------------------------------------------------------- +# Registration +# --------------------------------------------------------------------------- + + +def register_diffusers_eager_attention() -> None: + """Register ``modelopt_skip_softmax`` backend in diffusers. + + Safe to call multiple times; registration happens only once. + """ + global _BACKEND_REGISTERED + if _BACKEND_REGISTERED: + return + + # Extend the AttentionBackendName enum with our custom value + new_member = str.__new__(AttentionBackendName, _BACKEND_NAME) + new_member._name_ = "MODELOPT_SKIP_SOFTMAX" + new_member._value_ = _BACKEND_NAME + AttentionBackendName._member_map_["MODELOPT_SKIP_SOFTMAX"] = new_member + AttentionBackendName._value2member_map_[_BACKEND_NAME] = new_member + + # Register the backend function + _AttentionBackendRegistry._backends[new_member] = _diffusers_eager_attention + _AttentionBackendRegistry._constraints[new_member] = [] + _AttentionBackendRegistry._supported_arg_names[new_member] = set( + inspect.signature(_diffusers_eager_attention).parameters.keys() + ) + + _BACKEND_REGISTERED = True + + +def get_skip_softmax_attention_backend(): + """Return a context manager that activates the modelopt_skip_softmax backend. + + Raises RuntimeError if the backend has not been registered yet. + """ + if not _BACKEND_REGISTERED: + raise RuntimeError( + "modelopt_skip_softmax backend not registered. " + "Call register_diffusers_eager_attention() first." 
+ ) + return attention_backend(_BACKEND_NAME) diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py new file mode 100644 index 0000000000..91131f3205 --- /dev/null +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py @@ -0,0 +1,160 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Triton flash attention backend for diffusers models. + +Registers a ``modelopt_triton`` backend in diffusers' ``_AttentionBackendRegistry`` +that converts the diffusers [B, S, H, D] layout to the Triton FA kernel's varlen +[total_tokens, H, D] format. Supports skip-softmax tile skipping for sparse attention. + +Used during **inference** -- calibration uses the eager backend instead. +""" + +import inspect +import math +import threading + +import torch +from diffusers.models.attention_dispatch import ( + AttentionBackendName, + _AttentionBackendRegistry, + attention_backend, +) + +from modelopt.torch.kernels.triton_fa import attention + +_BACKEND_NAME = "modelopt_triton" +_BACKEND_REGISTERED = False + +# Thread-local storage for per-forward skip-softmax configuration. +# The method's get_sparse_context() sets these before each forward pass. 
+_thread_local = threading.local() + + +def set_triton_skip_softmax_config(threshold: float | None = None) -> None: + """Set thread-local skip-softmax config for the next Triton attention call.""" + _thread_local.skip_threshold = threshold + + +def clear_triton_skip_softmax_config() -> None: + """Clear thread-local skip-softmax config.""" + _thread_local.skip_threshold = None + + +# --------------------------------------------------------------------------- +# Triton attention implementation for diffusers layout +# --------------------------------------------------------------------------- + + +def _diffusers_triton_attention( + query: torch.Tensor, + key: torch.Tensor, + value: torch.Tensor, + attn_mask: torch.Tensor | None = None, + dropout_p: float = 0.0, + is_causal: bool = False, + scale: float | None = None, + enable_gqa: bool = False, +) -> torch.Tensor: + """Compute attention via Triton FA kernel on diffusers layout ``[B, S, H, D]``. + + Converts to the kernel's varlen format, calls the Triton FA kernel, and + converts back. 
+ """ + batch, seq_q, num_heads_q, head_dim = query.shape + seq_k = key.shape[1] + device = query.device + + # Reshape from diffusers [B, S, H, D] -> flat [B*S, H, D] + q = query.reshape(batch * seq_q, num_heads_q, head_dim).contiguous() + k = key.reshape(batch * seq_k, key.shape[2], head_dim).contiguous() + v = value.reshape(batch * seq_k, value.shape[2], head_dim).contiguous() + + # Build varlen metadata + b_start_loc_q = torch.arange(batch, device=device, dtype=torch.int32) * seq_q + b_seq_len_q = torch.full((batch,), seq_q, device=device, dtype=torch.int32) + + if scale is None: + scale = 1.0 / math.sqrt(head_dim) + + kw: dict = { + "b_start_loc": b_start_loc_q, + "b_seq_len": b_seq_len_q, + "max_input_len": seq_q, + "is_causal": is_causal, + "softmax_scale": scale, + } + + # If Q and KV have different sequence lengths, pass separate KV metadata + if seq_q != seq_k: + b_start_loc_k = torch.arange(batch, device=device, dtype=torch.int32) * seq_k + b_seq_len_k = torch.full((batch,), seq_k, device=device, dtype=torch.int32) + kw["b_start_loc_k"] = b_start_loc_k + kw["b_seq_len_k"] = b_seq_len_k + kw["max_input_len_k"] = seq_k + + # Read skip-softmax config from thread-local storage + threshold = getattr(_thread_local, "skip_threshold", None) + if threshold is not None and threshold > 0.0: + kw["skip_softmax_threshold"] = threshold + + o = attention(q, k, v, **kw) + + # Reshape back: [B*S, H, D] -> [B, S, H, D] + return o.view(batch, seq_q, num_heads_q, head_dim) + + +# --------------------------------------------------------------------------- +# Registration +# --------------------------------------------------------------------------- + + +def register_diffusers_triton_attention() -> None: + """Register ``modelopt_triton`` backend in diffusers. + + Safe to call multiple times; registration happens only once. 
+ """ + global _BACKEND_REGISTERED + if _BACKEND_REGISTERED: + return + + # Extend the AttentionBackendName enum with our custom value + new_member = str.__new__(AttentionBackendName, _BACKEND_NAME) + new_member._name_ = "MODELOPT_TRITON" + new_member._value_ = _BACKEND_NAME + AttentionBackendName._member_map_["MODELOPT_TRITON"] = new_member + AttentionBackendName._value2member_map_[_BACKEND_NAME] = new_member + + # Register the backend function + _AttentionBackendRegistry._backends[new_member] = _diffusers_triton_attention + _AttentionBackendRegistry._constraints[new_member] = [] + _AttentionBackendRegistry._supported_arg_names[new_member] = set( + inspect.signature(_diffusers_triton_attention).parameters.keys() + ) + + _BACKEND_REGISTERED = True + + +def get_triton_attention_backend(): + """Return a context manager that activates the modelopt_triton backend. + + Raises RuntimeError if the backend has not been registered yet. + """ + if not _BACKEND_REGISTERED: + raise RuntimeError( + "modelopt_triton backend not registered. " + "Call register_diffusers_triton_attention() first." + ) + return attention_backend(_BACKEND_NAME) diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py new file mode 100644 index 0000000000..6c082ee588 --- /dev/null +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Eager attention wrapper for LTX-2 (ltx_core) skip-softmax sparse attention. + +Patches ``Attention`` modules from ``ltx_core`` so that when the skip-softmax +thread-local flag is active, attention is computed eagerly with an explicit +``F.softmax`` call that the softmax-patching mechanism can intercept. + +Used during **calibration only** — inference uses the Triton FA kernel via +the diffusers Triton backend. +""" + +import math + +import torch +import torch.nn.functional as F +from ltx_core.model.transformer.attention import Attention + +from . import get_skip_softmax_context + + +def _ltx_eager_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + heads: int, + mask: torch.Tensor | None = None, +) -> torch.Tensor: + """Eager attention on LTX-2 layout ``[B, T, H*D]``. + + Mirrors the ``PytorchAttention`` class in ltx_core but uses an explicit + ``F.softmax`` instead of ``scaled_dot_product_attention``. 
+ """ + b, _, dim_total = q.shape + dim_head = dim_total // heads + + # Reshape to [B, T, H, D] then transpose to [B, H, T, D] + q = q.view(b, -1, heads, dim_head).transpose(1, 2) + k = k.view(b, -1, heads, dim_head).transpose(1, 2) + v = v.view(b, -1, heads, dim_head).transpose(1, 2) + + scale = 1.0 / math.sqrt(dim_head) + + # Q @ K^T * scale + scores = torch.matmul(q, k.transpose(-2, -1)) * scale + + # Apply mask if provided + if mask is not None: + # Expand mask dimensions to match scores [B, H, Sq, Sk] + if mask.ndim == 2: + mask = mask.unsqueeze(0) + if mask.ndim == 3: + mask = mask.unsqueeze(1) + scores = scores + mask + + # F.softmax — intercepted by skip-softmax patch + scores = F.softmax(scores, dim=-1) + + # scores @ V + out = torch.matmul(scores, v) + + # [B, H, T, D] → [B, T, H*D] + out = out.transpose(1, 2).reshape(b, -1, heads * dim_head) + return out + + +class _SkipSoftmaxLTXAttentionWrapper: + """Wraps an ``attention_function`` callable from ltx_core. + + When the thread-local skip-softmax flag is active, routes to the eager + attention path. Otherwise calls the original function. + """ + + def __init__(self, original_fn): + self._original_fn = original_fn + + def __call__( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + heads: int, + mask: torch.Tensor | None = None, + ) -> torch.Tensor: + if get_skip_softmax_context(): + return _ltx_eager_attention(q, k, v, heads, mask) + return self._original_fn(q, k, v, heads, mask) + + +def register_ltx_eager_attention(model: torch.nn.Module) -> None: + """Walk *model* and patch all ``ltx_core.model.transformer.attention.Attention`` modules. + + Patches modules so their ``attention_function`` is routed through the eager wrapper. + Safe to call multiple times on the same model — already-wrapped modules are + skipped. 
+ """ + for module in model.modules(): + if isinstance(module, Attention): + fn = module.attention_function + if not isinstance(fn, _SkipSoftmaxLTXAttentionWrapper): + module.attention_function = _SkipSoftmaxLTXAttentionWrapper(fn) diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py new file mode 100644 index 0000000000..1493fe8a0f --- /dev/null +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Triton flash attention wrapper for LTX-2 (ltx_core) skip-softmax sparse attention. + +Patches ``Attention`` modules from ``ltx_core`` so that when the Triton +skip-softmax flag is active, attention is computed via the Triton FA kernel +with fused tile skipping. + +Used during **inference** -- calibration uses the eager wrapper instead. 
+""" + +import math +import threading + +import torch +from ltx_core.model.transformer.attention import Attention + +from modelopt.torch.kernels.triton_fa import attention + +# Thread-local storage for skip-softmax configuration +_thread_local = threading.local() + + +def set_ltx_triton_context( + active: bool, + threshold: float | None = None, +) -> None: + """Set thread-local Triton config for LTX-2 attention.""" + _thread_local.active = active + _thread_local.threshold = threshold + + +def clear_ltx_triton_context() -> None: + """Clear thread-local Triton config.""" + _thread_local.active = False + _thread_local.threshold = None + + +def _get_ltx_triton_context() -> tuple[bool, float | None]: + """Return (active, threshold).""" + return ( + getattr(_thread_local, "active", False), + getattr(_thread_local, "threshold", None), + ) + + +def _ltx_triton_attention( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + heads: int, + mask: torch.Tensor | None = None, + threshold: float | None = None, +) -> torch.Tensor: + """Triton FA attention on LTX-2 layout ``[B, T, H*D]``. + + Converts from LTX-2's fused-head layout to the Triton kernel's varlen + format, calls the kernel with skip-softmax, and converts back. 
+ """ + b, seq_q, dim_total = q.shape + dim_head = dim_total // heads + seq_k = k.shape[1] + device = q.device + + # LTX-2 layout: [B, T, H*D] -> reshape to [B, T, H, D] -> flat [B*T, H, D] + q_flat = q.view(b, seq_q, heads, dim_head).reshape(b * seq_q, heads, dim_head).contiguous() + k_flat = k.view(b, seq_k, heads, dim_head).reshape(b * seq_k, heads, dim_head).contiguous() + v_flat = v.view(b, seq_k, heads, dim_head).reshape(b * seq_k, heads, dim_head).contiguous() + + # Build varlen metadata + b_start_loc_q = torch.arange(b, device=device, dtype=torch.int32) * seq_q + b_seq_len_q = torch.full((b,), seq_q, device=device, dtype=torch.int32) + + scale = 1.0 / math.sqrt(dim_head) + + kw: dict = { + "b_start_loc": b_start_loc_q, + "b_seq_len": b_seq_len_q, + "max_input_len": seq_q, + "is_causal": False, # Diffusion uses bidirectional attention + "softmax_scale": scale, + } + + # Handle different Q/KV sequence lengths + if seq_q != seq_k: + b_start_loc_k = torch.arange(b, device=device, dtype=torch.int32) * seq_k + b_seq_len_k = torch.full((b,), seq_k, device=device, dtype=torch.int32) + kw["b_start_loc_k"] = b_start_loc_k + kw["b_seq_len_k"] = b_seq_len_k + kw["max_input_len_k"] = seq_k + + # Skip-softmax threshold + if threshold is not None and threshold > 0.0: + kw["skip_softmax_threshold"] = threshold + + o = attention(q_flat, k_flat, v_flat, **kw) + + # Reshape back: [B*T, H, D] -> [B, T, H*D] + return o.view(b, seq_q, heads * dim_head) + + +class _TritonLTXAttentionWrapper: + """Wraps an ``attention_function`` callable from ltx_core. + + When the thread-local Triton skip-softmax flag is active, routes to the + Triton FA kernel. Otherwise calls the original function. 
+ """ + + def __init__(self, original_fn): + self._original_fn = original_fn + + def __call__( + self, + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + heads: int, + mask: torch.Tensor | None = None, + ) -> torch.Tensor: + active, threshold = _get_ltx_triton_context() + if active: + return _ltx_triton_attention(q, k, v, heads, mask, threshold) + return self._original_fn(q, k, v, heads, mask) + + +def register_ltx_triton_attention(model: torch.nn.Module) -> None: + """Walk *model* and patch all ``ltx_core.Attention`` modules for Triton dispatch. + + Safe to call multiple times -- already-wrapped modules are skipped. + """ + for module in model.modules(): + if isinstance(module, Attention): + fn = module.attention_function + if not isinstance(fn, _TritonLTXAttentionWrapper): + module.attention_function = _TritonLTXAttentionWrapper(fn) diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/flash_skip_softmax.py b/modelopt/torch/sparsity/attention_sparsity/methods/flash_skip_softmax.py index 2501b58f65..aab399292a 100644 --- a/modelopt/torch/sparsity/attention_sparsity/methods/flash_skip_softmax.py +++ b/modelopt/torch/sparsity/attention_sparsity/methods/flash_skip_softmax.py @@ -20,6 +20,7 @@ """ import math +from contextlib import ExitStack from typing import Any import numpy as np @@ -369,7 +370,11 @@ def get_threshold_info(self) -> dict[str, Any]: } def get_sparse_context(self, module: torch.nn.Module): - """Return a context manager that patches F.softmax with sparse masking.""" + """Return a context manager that patches F.softmax with sparse masking. + + Also registers the diffusers eager backend so that diffusion models + (which don't call F.softmax directly) route through the patched path. 
+ """ original_softmax = F.softmax def sparse_softmax(input, dim=-1, *args, **kwargs): @@ -379,7 +384,21 @@ def sparse_softmax(input, dim=-1, *args, **kwargs): input = self.apply_sparsity(input, sparse_mask) return original_softmax(input, dim, *args, **kwargs) - return replace_function(torch.nn.functional, "softmax", sparse_softmax) + from ..kernels import set_skip_softmax_context + + stack = ExitStack() + set_skip_softmax_context(True) + stack.callback(set_skip_softmax_context, False) + + try: + from ..kernels.diffusers_eager_attention import get_skip_softmax_attention_backend + + stack.enter_context(get_skip_softmax_attention_backend()) + except (ImportError, RuntimeError): + pass + + stack.enter_context(replace_function(torch.nn.functional, "softmax", sparse_softmax)) + return stack @property def name(self) -> str: diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py index 4db51e894e..b885eeaea5 100644 --- a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py +++ b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py @@ -17,6 +17,8 @@ from contextlib import contextmanager +import torch + from .registry import SparseAttentionMethod, register_sparse_method @@ -45,6 +47,18 @@ def name(self) -> str: """Method name identifier.""" return "triton_skip_softmax" + def calculate_sparsity(self, attention_scores): + """Return a no-op mask (skip decision is made inside the Triton kernel).""" + mask = torch.ones_like(attention_scores, dtype=torch.bool) + return mask, {} + + def apply_sparsity(self, attention_scores, sparse_mask=None): + """Not supported — tile skipping is fused into the Triton kernel.""" + raise NotImplementedError( + "triton_skip_softmax applies tile skipping inside the Triton kernel. " + "Use backend='triton', not backend='pytorch'." 
+ ) + def get_sparse_context(self, module): """Return context manager that activates skip-softmax during forward.""" diff --git a/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py b/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py index 599832943d..988d48418b 100644 --- a/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py +++ b/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py @@ -117,6 +117,15 @@ def _is_supported_model(model: nn.Module) -> bool: except ImportError: pass + # Check for diffusers ModelMixin + try: + from diffusers.models.modeling_utils import ModelMixin + + if isinstance(model, ModelMixin): + return True + except ImportError: + pass + # Support any PyTorch model with attention modules return isinstance(model, nn.Module) diff --git a/modelopt/torch/sparsity/attention_sparsity/stats_manager.py b/modelopt/torch/sparsity/attention_sparsity/stats_manager.py index 1eabdfe358..de70c3cadf 100644 --- a/modelopt/torch/sparsity/attention_sparsity/stats_manager.py +++ b/modelopt/torch/sparsity/attention_sparsity/stats_manager.py @@ -79,14 +79,15 @@ def collect(self, stats: dict): # In calibration mode, store per-sample stats if self.calibration_mode: - self.per_sample_stats.append( - { - "module": self.module_name, - "sparsity": stats.get("sparsity", 0.0), - "sample_length": stats.get("sample_length", 0), - "phase": phase, - } - ) + sample_stat = { + "module": self.module_name, + "sparsity": stats.get("sparsity", 0.0), + "sample_length": stats.get("sample_length", 0), + "phase": phase, + } + if "normalized_gaps" in stats: + sample_stat["normalized_gaps"] = stats["normalized_gaps"] + self.per_sample_stats.append(sample_stat) def get_summary(self) -> dict: """Get aggregated statistics summary. 
From 1f8f0d3f6fb9ab8bd2d61cb88d1cc6d040fc2d02 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Thu, 2 Apr 2026 07:54:06 +0000 Subject: [PATCH 02/21] Add test case Signed-off-by: Jingyu Xin --- .../test_kernel_backends.py | 528 ++++++++++++++++++ 1 file changed, 528 insertions(+) create mode 100644 tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py diff --git a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py new file mode 100644 index 0000000000..b6f4119eb5 --- /dev/null +++ b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py @@ -0,0 +1,528 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Unit tests for diffusers/LTX kernel backends with mocked dependencies. + +These tests verify the attention computation logic and registration without +requiring diffusers or ltx_core to be installed. 
+""" + +import importlib +import sys +import types +from unittest.mock import MagicMock, patch + +import pytest +import torch +import torch.nn as nn + +# --------------------------------------------------------------------------- +# Helpers: mock diffusers and ltx_core modules before importing backends +# --------------------------------------------------------------------------- + + +def _make_mock_diffusers(): + """Create a mock diffusers module hierarchy for attention_dispatch.""" + mock_diffusers = types.ModuleType("diffusers") + mock_models = types.ModuleType("diffusers.models") + mock_attention_dispatch = types.ModuleType("diffusers.models.attention_dispatch") + + # Create a real-ish AttentionBackendName enum mock + class FakeAttentionBackendName(str): + _member_map_ = {} + _value2member_map_ = {} + + mock_attention_dispatch.AttentionBackendName = FakeAttentionBackendName + + class FakeRegistry: + _backends = {} + _constraints = {} + _supported_arg_names = {} + + mock_attention_dispatch._AttentionBackendRegistry = FakeRegistry + mock_attention_dispatch.attention_backend = MagicMock() + + mock_diffusers.models = mock_models + mock_models.attention_dispatch = mock_attention_dispatch + + return { + "diffusers": mock_diffusers, + "diffusers.models": mock_models, + "diffusers.models.attention_dispatch": mock_attention_dispatch, + } + + +def _make_mock_ltx_core(): + """Create a mock ltx_core module hierarchy.""" + mock_ltx = types.ModuleType("ltx_core") + mock_model = types.ModuleType("ltx_core.model") + mock_transformer = types.ModuleType("ltx_core.model.transformer") + mock_attn_mod = types.ModuleType("ltx_core.model.transformer.attention") + + class FakeAttention(nn.Module): + def __init__(self): + super().__init__() + self.attention_function = lambda q, k, v, heads, mask=None: q + + mock_attn_mod.Attention = FakeAttention + + mock_ltx.model = mock_model + mock_model.transformer = mock_transformer + mock_transformer.attention = mock_attn_mod + + return { + 
"ltx_core": mock_ltx, + "ltx_core.model": mock_model, + "ltx_core.model.transformer": mock_transformer, + "ltx_core.model.transformer.attention": mock_attn_mod, + } + + +# --------------------------------------------------------------------------- +# Tests: kernels/__init__.py thread-local context +# --------------------------------------------------------------------------- + + +class TestSkipSoftmaxContext: + """Test thread-local skip-softmax context in kernels/__init__.py.""" + + def test_default_is_false(self): + from modelopt.torch.sparsity.attention_sparsity.kernels import get_skip_softmax_context + + assert get_skip_softmax_context() is False + + def test_set_and_get(self): + from modelopt.torch.sparsity.attention_sparsity.kernels import ( + get_skip_softmax_context, + set_skip_softmax_context, + ) + + set_skip_softmax_context(True) + assert get_skip_softmax_context() is True + set_skip_softmax_context(False) + assert get_skip_softmax_context() is False + + +# --------------------------------------------------------------------------- +# Tests: diffusers eager attention +# --------------------------------------------------------------------------- + + +class TestDiffusersEagerAttention: + """Test diffusers eager attention backend with mocked diffusers imports.""" + + @pytest.fixture(autouse=True) + def _setup_mocks(self): + """Inject mock diffusers modules and reimport the backend.""" + mocks = _make_mock_diffusers() + mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_eager_attention" + # Remove cached module so reimport picks up mocks + sys.modules.pop(mod_name, None) + with patch.dict(sys.modules, mocks): + self.mod = importlib.import_module(mod_name) + yield + sys.modules.pop(mod_name, None) + + def test_eager_attention_basic(self): + """Eager attention produces correct output shape [B, S, H, D].""" + b, s, h, d = 2, 8, 4, 16 + q = torch.randn(b, s, h, d) + k = torch.randn(b, s, h, d) + v = torch.randn(b, s, h, d) + + out = 
self.mod._diffusers_eager_attention(q, k, v) + assert out.shape == (b, s, h, d) + + def test_eager_attention_cross_attention(self): + """Eager attention handles different Q/KV sequence lengths.""" + b, sq, sk, h, d = 1, 4, 12, 2, 8 + q = torch.randn(b, sq, h, d) + k = torch.randn(b, sk, h, d) + v = torch.randn(b, sk, h, d) + + out = self.mod._diffusers_eager_attention(q, k, v) + assert out.shape == (b, sq, h, d) + + def test_eager_attention_with_causal_mask(self): + """Causal mask produces lower-triangular attention pattern.""" + b, s, h, d = 1, 4, 1, 8 + q = torch.randn(b, s, h, d) + k = torch.randn(b, s, h, d) + v = torch.eye(s).unsqueeze(0).unsqueeze(2).expand(b, s, h, s) + # With identity V and causal, output should reflect causal structure + out = self.mod._diffusers_eager_attention(q, k, v, is_causal=True) + assert out.shape == (b, s, h, s) + + def test_eager_attention_with_mask(self): + """Attention mask is applied correctly.""" + b, s, h, d = 1, 4, 2, 8 + q = torch.randn(b, s, h, d) + k = torch.randn(b, s, h, d) + v = torch.randn(b, s, h, d) + # Mask that blocks all positions -> output should be mean of V + mask = torch.zeros(b, 1, s, s) # no masking + out = self.mod._diffusers_eager_attention(q, k, v, attn_mask=mask) + assert out.shape == (b, s, h, d) + + def test_eager_attention_gqa(self): + """GQA: fewer KV heads are repeated to match Q heads.""" + b, s, hq, hkv, d = 1, 4, 8, 2, 16 + q = torch.randn(b, s, hq, d) + k = torch.randn(b, s, hkv, d) + v = torch.randn(b, s, hkv, d) + + out = self.mod._diffusers_eager_attention(q, k, v, enable_gqa=True) + assert out.shape == (b, s, hq, d) + + def test_register_idempotent(self): + """Registration is safe to call multiple times.""" + self.mod.register_diffusers_eager_attention() + self.mod.register_diffusers_eager_attention() # second call should not raise + + def test_get_backend_before_register_raises(self): + """Getting backend before registration raises RuntimeError.""" + self.mod._BACKEND_REGISTERED = False + 
with pytest.raises(RuntimeError, match="not registered"): + self.mod.get_skip_softmax_attention_backend() + + +# --------------------------------------------------------------------------- +# Tests: diffusers triton attention +# --------------------------------------------------------------------------- + + +class TestDiffusersTritonAttention: + """Test diffusers Triton attention backend with mocked dependencies.""" + + @pytest.fixture(autouse=True) + def _setup_mocks(self): + """Inject mock diffusers and triton_fa modules.""" + mocks = _make_mock_diffusers() + mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_triton_attention" + sys.modules.pop(mod_name, None) + + # Mock the triton_fa.attention function + def fake_attention(q, k, v, **kw): + return q # just return q as output + + mocks["modelopt.torch.kernels.triton_fa"] = types.ModuleType( + "modelopt.torch.kernels.triton_fa" + ) + mocks["modelopt.torch.kernels.triton_fa"].attention = fake_attention + + with patch.dict(sys.modules, mocks): + self.mod = importlib.import_module(mod_name) + yield + sys.modules.pop(mod_name, None) + + def test_triton_attention_basic(self): + """Triton attention reshapes correctly [B,S,H,D] -> varlen -> [B,S,H,D].""" + b, s, h, d = 2, 8, 4, 16 + q = torch.randn(b, s, h, d) + k = torch.randn(b, s, h, d) + v = torch.randn(b, s, h, d) + + out = self.mod._diffusers_triton_attention(q, k, v) + assert out.shape == (b, s, h, d) + + def test_triton_attention_cross_attention(self): + """Different Q/KV sequence lengths produce separate varlen metadata.""" + b, sq, sk, h, d = 1, 4, 12, 2, 8 + q = torch.randn(b, sq, h, d) + k = torch.randn(b, sk, h, d) + v = torch.randn(b, sk, h, d) + + out = self.mod._diffusers_triton_attention(q, k, v) + assert out.shape == (b, sq, h, d) + + def test_set_clear_config(self): + """Thread-local config set/clear cycle.""" + self.mod.set_triton_skip_softmax_config(threshold=0.1) + assert self.mod._thread_local.skip_threshold == 0.1 + 
self.mod.clear_triton_skip_softmax_config() + assert self.mod._thread_local.skip_threshold is None + + def test_threshold_passed_to_kernel(self): + """When threshold is set, it appears in kernel kwargs.""" + captured_kw = {} + original_attention = self.mod.attention + + def spy_attention(q, k, v, **kw): + captured_kw.update(kw) + return q + + self.mod.attention = spy_attention + try: + self.mod.set_triton_skip_softmax_config(threshold=0.05) + b, s, h, d = 1, 4, 2, 8 + q = torch.randn(b, s, h, d) + self.mod._diffusers_triton_attention(q, q, q) + assert captured_kw.get("skip_softmax_threshold") == 0.05 + finally: + self.mod.attention = original_attention + self.mod.clear_triton_skip_softmax_config() + + def test_register_idempotent(self): + """Registration is safe to call multiple times.""" + self.mod.register_diffusers_triton_attention() + self.mod.register_diffusers_triton_attention() + + def test_get_backend_before_register_raises(self): + """Getting backend before registration raises RuntimeError.""" + self.mod._BACKEND_REGISTERED = False + with pytest.raises(RuntimeError, match="not registered"): + self.mod.get_triton_attention_backend() + + +# --------------------------------------------------------------------------- +# Tests: LTX eager attention +# --------------------------------------------------------------------------- + + +class TestLTXEagerAttention: + """Test LTX-2 eager attention backend with mocked ltx_core.""" + + @pytest.fixture(autouse=True) + def _setup_mocks(self): + """Inject mock ltx_core modules.""" + mocks = _make_mock_ltx_core() + mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.ltx_eager_attention" + sys.modules.pop(mod_name, None) + with patch.dict(sys.modules, mocks): + self.mod = importlib.import_module(mod_name) + self.FakeAttention = mocks["ltx_core.model.transformer.attention"].Attention + yield + sys.modules.pop(mod_name, None) + + def test_eager_attention_basic(self): + """LTX eager attention: [B, T, H*D] -> [B, T, 
H*D].""" + b, t, h, d = 2, 8, 4, 16 + q = torch.randn(b, t, h * d) + k = torch.randn(b, t, h * d) + v = torch.randn(b, t, h * d) + + out = self.mod._ltx_eager_attention(q, k, v, heads=h) + assert out.shape == (b, t, h * d) + + def test_eager_attention_with_mask(self): + """LTX eager attention handles 2D and 3D masks.""" + b, t, h, d = 1, 4, 2, 8 + q = torch.randn(b, t, h * d) + k = torch.randn(b, t, h * d) + v = torch.randn(b, t, h * d) + + # 2D mask [t, t] + mask_2d = torch.zeros(t, t) + out = self.mod._ltx_eager_attention(q, k, v, heads=h, mask=mask_2d) + assert out.shape == (b, t, h * d) + + # 3D mask [b, t, t] + mask_3d = torch.zeros(b, t, t) + out = self.mod._ltx_eager_attention(q, k, v, heads=h, mask=mask_3d) + assert out.shape == (b, t, h * d) + + def test_wrapper_routes_to_eager_when_active(self): + """Wrapper calls eager attention when skip-softmax context is active.""" + from modelopt.torch.sparsity.attention_sparsity.kernels import set_skip_softmax_context + + original_fn = MagicMock(return_value=torch.zeros(1, 4, 32)) + wrapper = self.mod._SkipSoftmaxLTXAttentionWrapper(original_fn) + + b, t, h, d = 1, 4, 2, 16 + q = torch.randn(b, t, h * d) + k = torch.randn(b, t, h * d) + v = torch.randn(b, t, h * d) + + # Inactive: calls original + out = wrapper(q, k, v, heads=h) + original_fn.assert_called_once() + + # Active: calls eager (not original) + original_fn.reset_mock() + set_skip_softmax_context(True) + try: + out = wrapper(q, k, v, heads=h) + original_fn.assert_not_called() + assert out.shape == (b, t, h * d) + finally: + set_skip_softmax_context(False) + + def test_register_patches_attention_modules(self): + """register_ltx_eager_attention patches Attention modules in model.""" + model = nn.Sequential() + attn = self.FakeAttention() + model.add_module("attn", attn) + + self.mod.register_ltx_eager_attention(model) + + assert isinstance(attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) + + # Idempotent: second call doesn't double-wrap + 
self.mod.register_ltx_eager_attention(model) + assert isinstance(attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) + + +# --------------------------------------------------------------------------- +# Tests: LTX triton attention +# --------------------------------------------------------------------------- + + +class TestLTXTritonAttention: + """Test LTX-2 Triton attention backend with mocked dependencies.""" + + @pytest.fixture(autouse=True) + def _setup_mocks(self): + """Inject mock ltx_core and triton_fa modules.""" + mocks = _make_mock_ltx_core() + mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.ltx_triton_attention" + sys.modules.pop(mod_name, None) + + def fake_attention(q, k, v, **kw): + return q + + mocks["modelopt.torch.kernels.triton_fa"] = types.ModuleType( + "modelopt.torch.kernels.triton_fa" + ) + mocks["modelopt.torch.kernels.triton_fa"].attention = fake_attention + + with patch.dict(sys.modules, mocks): + self.mod = importlib.import_module(mod_name) + self.FakeAttention = mocks["ltx_core.model.transformer.attention"].Attention + yield + sys.modules.pop(mod_name, None) + + def test_triton_attention_basic(self): + """LTX triton attention: [B, T, H*D] -> varlen -> [B, T, H*D].""" + b, t, h, d = 2, 8, 4, 16 + q = torch.randn(b, t, h * d) + k = torch.randn(b, t, h * d) + v = torch.randn(b, t, h * d) + + out = self.mod._ltx_triton_attention(q, k, v, heads=h, threshold=0.1) + assert out.shape == (b, t, h * d) + + def test_set_clear_context(self): + """Thread-local context set/clear cycle.""" + self.mod.set_ltx_triton_context(active=True, threshold=0.05) + active, threshold = self.mod._get_ltx_triton_context() + assert active is True + assert threshold == 0.05 + + self.mod.clear_ltx_triton_context() + active, threshold = self.mod._get_ltx_triton_context() + assert active is False + assert threshold is None + + def test_wrapper_routes_to_triton_when_active(self): + """Wrapper calls Triton attention when context is active.""" + 
original_fn = MagicMock(return_value=torch.zeros(1, 4, 32)) + wrapper = self.mod._TritonLTXAttentionWrapper(original_fn) + + b, t, h, d = 1, 4, 2, 16 + q = torch.randn(b, t, h * d) + k = torch.randn(b, t, h * d) + v = torch.randn(b, t, h * d) + + # Inactive: calls original + out = wrapper(q, k, v, heads=h) + original_fn.assert_called_once() + + # Active: calls triton (not original) + original_fn.reset_mock() + self.mod.set_ltx_triton_context(active=True, threshold=0.1) + try: + out = wrapper(q, k, v, heads=h) + original_fn.assert_not_called() + assert out.shape == (b, t, h * d) + finally: + self.mod.clear_ltx_triton_context() + + def test_register_patches_attention_modules(self): + """register_ltx_triton_attention patches Attention modules.""" + model = nn.Sequential() + attn = self.FakeAttention() + model.add_module("attn", attn) + + self.mod.register_ltx_triton_attention(model) + assert isinstance(attn.attention_function, self.mod._TritonLTXAttentionWrapper) + + # Idempotent + self.mod.register_ltx_triton_attention(model) + assert isinstance(attn.attention_function, self.mod._TritonLTXAttentionWrapper) + + def test_threshold_passed_to_kernel(self): + """When threshold is set, it appears in kernel kwargs.""" + captured_kw = {} + original_attention = self.mod.attention + + def spy_attention(q, k, v, **kw): + captured_kw.update(kw) + return q + + self.mod.attention = spy_attention + try: + b, t, h, d = 1, 4, 2, 8 + q = torch.randn(b, t, h * d) + self.mod._ltx_triton_attention(q, q, q, heads=h, threshold=0.07) + assert captured_kw.get("skip_softmax_threshold") == 0.07 + finally: + self.mod.attention = original_attention + + +# --------------------------------------------------------------------------- +# Tests: conversion.py _register_diffusers_backends_if_needed +# --------------------------------------------------------------------------- + + +class TestRegisterDiffusersBackends: + """Test _register_diffusers_backends_if_needed with mocked imports.""" + + def 
test_no_diffusers_no_error(self): + """When diffusers is not installed, function completes without error.""" + from modelopt.torch.sparsity.attention_sparsity.conversion import ( + _register_diffusers_backends_if_needed, + ) + + model = nn.Linear(10, 10) + # Should not raise even if diffusers is not installed + _register_diffusers_backends_if_needed(model) + + def test_with_diffusers_model(self): + """When model is a diffusers ModelMixin, backends are registered.""" + from modelopt.torch.sparsity.attention_sparsity.conversion import ( + _register_diffusers_backends_if_needed, + ) + + # Create a fake ModelMixin so isinstance check passes + mock_mixin = type("ModelMixin", (nn.Module,), {}) + mock_modeling_utils = types.ModuleType("diffusers.models.modeling_utils") + mock_modeling_utils.ModelMixin = mock_mixin + + fake_model = mock_mixin() + + with ( + patch.dict(sys.modules, {"diffusers.models.modeling_utils": mock_modeling_utils}), + patch( + "modelopt.torch.sparsity.attention_sparsity.kernels.register_diffusers_eager_attention", + MagicMock(), + ) as mock_eager, + patch( + "modelopt.torch.sparsity.attention_sparsity.kernels.register_diffusers_triton_attention", + MagicMock(), + ) as mock_triton, + ): + _register_diffusers_backends_if_needed(fake_model) + mock_eager.assert_called_once() + mock_triton.assert_called_once() From 5873652c098ac428ffec6cbf04795fdf866f35f8 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Thu, 2 Apr 2026 08:36:35 +0000 Subject: [PATCH 03/21] Fixed error Signed-off-by: Jingyu Xin --- .../sparsity/attention_sparsity/conversion.py | 7 +- .../attention_sparsity/kernels/__init__.py | 9 +- .../kernels/diffusers_triton_attention.py | 3 +- .../kernels/ltx_triton_attention.py | 3 +- .../test_kernel_backends.py | 413 +++++++----------- 5 files changed, 175 insertions(+), 260 deletions(-) diff --git a/modelopt/torch/sparsity/attention_sparsity/conversion.py b/modelopt/torch/sparsity/attention_sparsity/conversion.py index 8bd6a9b96a..c8a8aea605 100644 
--- a/modelopt/torch/sparsity/attention_sparsity/conversion.py +++ b/modelopt/torch/sparsity/attention_sparsity/conversion.py @@ -121,13 +121,16 @@ def _register_diffusers_backends_if_needed(model: nn.Module) -> None: register_diffusers_eager_attention() if register_diffusers_triton_attention is not None: register_diffusers_triton_attention() - except ImportError: + except (ImportError, Exception): pass # Patch ltx_core Attention modules if present (independent of diffusers) import contextlib - from .kernels import register_ltx_eager_attention, register_ltx_triton_attention + try: + from .kernels import register_ltx_eager_attention, register_ltx_triton_attention + except (ImportError, RuntimeError): + return if register_ltx_eager_attention is not None: with contextlib.suppress(Exception): diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py index bb7e921a13..81f4295bb4 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py @@ -47,16 +47,17 @@ def get_skip_softmax_context() -> bool: register_ltx_eager_attention = None register_ltx_triton_attention = None -with contextlib.suppress(ImportError): +# Suppress ImportError (missing package) and RuntimeError (triton without GPU driver) +with contextlib.suppress(ImportError, RuntimeError): from .diffusers_eager_attention import register_diffusers_eager_attention -with contextlib.suppress(ImportError): +with contextlib.suppress(ImportError, RuntimeError): from .diffusers_triton_attention import register_diffusers_triton_attention -with contextlib.suppress(ImportError): +with contextlib.suppress(ImportError, RuntimeError): from .ltx_eager_attention import register_ltx_eager_attention -with contextlib.suppress(ImportError): +with contextlib.suppress(ImportError, RuntimeError): from .ltx_triton_attention import register_ltx_triton_attention __all__ = [ 
diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py index 91131f3205..17fec4e4eb 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py @@ -33,7 +33,7 @@ attention_backend, ) -from modelopt.torch.kernels.triton_fa import attention +from modelopt.torch.kernels import attention _BACKEND_NAME = "modelopt_triton" _BACKEND_REGISTERED = False @@ -110,6 +110,7 @@ def _diffusers_triton_attention( if threshold is not None and threshold > 0.0: kw["skip_softmax_threshold"] = threshold + assert attention is not None, "Triton attention kernel not available (requires CUDA + triton)" o = attention(q, k, v, **kw) # Reshape back: [B*S, H, D] -> [B, S, H, D] diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py index 1493fe8a0f..ddb880026c 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py @@ -28,7 +28,7 @@ import torch from ltx_core.model.transformer.attention import Attention -from modelopt.torch.kernels.triton_fa import attention +from modelopt.torch.kernels import attention # Thread-local storage for skip-softmax configuration _thread_local = threading.local() @@ -106,6 +106,7 @@ def _ltx_triton_attention( if threshold is not None and threshold > 0.0: kw["skip_softmax_threshold"] = threshold + assert attention is not None, "Triton attention kernel not available (requires CUDA + triton)" o = attention(q_flat, k_flat, v_flat, **kw) # Reshape back: [B*T, H, D] -> [B, T, H*D] diff --git a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py 
b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py index b6f4119eb5..22ea4580b1 100644 --- a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py +++ b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py @@ -16,7 +16,7 @@ """Unit tests for diffusers/LTX kernel backends with mocked dependencies. These tests verify the attention computation logic and registration without -requiring diffusers or ltx_core to be installed. +requiring diffusers, ltx_core, or a GPU (triton driver). """ import importlib @@ -29,90 +29,133 @@ import torch.nn as nn # --------------------------------------------------------------------------- -# Helpers: mock diffusers and ltx_core modules before importing backends +# Module names that must be cleaned from sys.modules between tests # --------------------------------------------------------------------------- +_KERNELS_PKG = "modelopt.torch.sparsity.attention_sparsity.kernels" +_ALL_KERNEL_MODS = [ + _KERNELS_PKG, + f"{_KERNELS_PKG}.diffusers_eager_attention", + f"{_KERNELS_PKG}.diffusers_triton_attention", + f"{_KERNELS_PKG}.ltx_eager_attention", + f"{_KERNELS_PKG}.ltx_triton_attention", +] + + +def _purge_kernel_modules(): + """Remove all kernel backend modules from sys.modules.""" + for name in _ALL_KERNEL_MODS: + sys.modules.pop(name, None) + + +# --------------------------------------------------------------------------- +# Helpers: build mock module dicts +# --------------------------------------------------------------------------- + + +def _make_base_mocks(): + """Mocks needed by every test: modelopt.torch.kernels + triton_fa.""" + mock_kernels = types.ModuleType("modelopt.torch.kernels") + + def fake_attention(q, k, v, **kw): + return q + + mock_kernels.IS_AVAILABLE = True + mock_kernels.attention = fake_attention + mock_kernels.register_triton_attention = None + + mock_triton_fa = types.ModuleType("modelopt.torch.kernels.triton_fa") + mock_triton_fa.attention = fake_attention + + 
return { + "modelopt.torch.kernels": mock_kernels, + "modelopt.torch.kernels.triton_fa": mock_triton_fa, + } def _make_mock_diffusers(): - """Create a mock diffusers module hierarchy for attention_dispatch.""" + """Mock diffusers.models.attention_dispatch.""" mock_diffusers = types.ModuleType("diffusers") mock_models = types.ModuleType("diffusers.models") - mock_attention_dispatch = types.ModuleType("diffusers.models.attention_dispatch") + mock_ad = types.ModuleType("diffusers.models.attention_dispatch") - # Create a real-ish AttentionBackendName enum mock - class FakeAttentionBackendName(str): - _member_map_ = {} - _value2member_map_ = {} + class FakeBackendName(str): + _member_map_: dict = {} + _value2member_map_: dict = {} - mock_attention_dispatch.AttentionBackendName = FakeAttentionBackendName + mock_ad.AttentionBackendName = FakeBackendName class FakeRegistry: - _backends = {} - _constraints = {} - _supported_arg_names = {} + _backends: dict = {} + _constraints: dict = {} + _supported_arg_names: dict = {} - mock_attention_dispatch._AttentionBackendRegistry = FakeRegistry - mock_attention_dispatch.attention_backend = MagicMock() + mock_ad._AttentionBackendRegistry = FakeRegistry + mock_ad.attention_backend = MagicMock() mock_diffusers.models = mock_models - mock_models.attention_dispatch = mock_attention_dispatch - + mock_models.attention_dispatch = mock_ad return { "diffusers": mock_diffusers, "diffusers.models": mock_models, - "diffusers.models.attention_dispatch": mock_attention_dispatch, + "diffusers.models.attention_dispatch": mock_ad, } def _make_mock_ltx_core(): - """Create a mock ltx_core module hierarchy.""" + """Mock ltx_core.model.transformer.attention.""" mock_ltx = types.ModuleType("ltx_core") mock_model = types.ModuleType("ltx_core.model") - mock_transformer = types.ModuleType("ltx_core.model.transformer") - mock_attn_mod = types.ModuleType("ltx_core.model.transformer.attention") + mock_tf = types.ModuleType("ltx_core.model.transformer") + 
mock_attn = types.ModuleType("ltx_core.model.transformer.attention") class FakeAttention(nn.Module): def __init__(self): super().__init__() self.attention_function = lambda q, k, v, heads, mask=None: q - mock_attn_mod.Attention = FakeAttention - + mock_attn.Attention = FakeAttention mock_ltx.model = mock_model - mock_model.transformer = mock_transformer - mock_transformer.attention = mock_attn_mod - + mock_model.transformer = mock_tf + mock_tf.attention = mock_attn return { "ltx_core": mock_ltx, "ltx_core.model": mock_model, - "ltx_core.model.transformer": mock_transformer, - "ltx_core.model.transformer.attention": mock_attn_mod, + "ltx_core.model.transformer": mock_tf, + "ltx_core.model.transformer.attention": mock_attn, } +def _import_fresh(mod_name: str, extra_mocks: dict): + """Purge kernel modules, patch sys.modules, and reimport ``mod_name``.""" + _purge_kernel_modules() + mocks = {**_make_base_mocks(), **extra_mocks} + with patch.dict(sys.modules, mocks): + # Reimport the parent package first so submodule imports resolve + kernels_pkg = importlib.import_module(_KERNELS_PKG) + mod = importlib.import_module(mod_name) + return kernels_pkg, mod + + # --------------------------------------------------------------------------- # Tests: kernels/__init__.py thread-local context # --------------------------------------------------------------------------- class TestSkipSoftmaxContext: - """Test thread-local skip-softmax context in kernels/__init__.py.""" + @pytest.fixture(autouse=True) + def _setup(self): + self.kernels, _ = _import_fresh(_KERNELS_PKG, {}) + yield + _purge_kernel_modules() def test_default_is_false(self): - from modelopt.torch.sparsity.attention_sparsity.kernels import get_skip_softmax_context - - assert get_skip_softmax_context() is False + assert self.kernels.get_skip_softmax_context() is False def test_set_and_get(self): - from modelopt.torch.sparsity.attention_sparsity.kernels import ( - get_skip_softmax_context, - set_skip_softmax_context, - ) - 
- set_skip_softmax_context(True) - assert get_skip_softmax_context() is True - set_skip_softmax_context(False) - assert get_skip_softmax_context() is False + self.kernels.set_skip_softmax_context(True) + assert self.kernels.get_skip_softmax_context() is True + self.kernels.set_skip_softmax_context(False) + assert self.kernels.get_skip_softmax_context() is False # --------------------------------------------------------------------------- @@ -121,78 +164,55 @@ def test_set_and_get(self): class TestDiffusersEagerAttention: - """Test diffusers eager attention backend with mocked diffusers imports.""" - @pytest.fixture(autouse=True) - def _setup_mocks(self): - """Inject mock diffusers modules and reimport the backend.""" - mocks = _make_mock_diffusers() - mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_eager_attention" - # Remove cached module so reimport picks up mocks - sys.modules.pop(mod_name, None) - with patch.dict(sys.modules, mocks): - self.mod = importlib.import_module(mod_name) - yield - sys.modules.pop(mod_name, None) - - def test_eager_attention_basic(self): - """Eager attention produces correct output shape [B, S, H, D].""" + def _setup(self): + _, self.mod = _import_fresh( + f"{_KERNELS_PKG}.diffusers_eager_attention", _make_mock_diffusers() + ) + yield + _purge_kernel_modules() + + def test_basic(self): b, s, h, d = 2, 8, 4, 16 q = torch.randn(b, s, h, d) - k = torch.randn(b, s, h, d) - v = torch.randn(b, s, h, d) - - out = self.mod._diffusers_eager_attention(q, k, v) + out = self.mod._diffusers_eager_attention(q, q, q) assert out.shape == (b, s, h, d) - def test_eager_attention_cross_attention(self): - """Eager attention handles different Q/KV sequence lengths.""" + def test_cross_attention(self): b, sq, sk, h, d = 1, 4, 12, 2, 8 q = torch.randn(b, sq, h, d) k = torch.randn(b, sk, h, d) v = torch.randn(b, sk, h, d) - out = self.mod._diffusers_eager_attention(q, k, v) assert out.shape == (b, sq, h, d) - def 
test_eager_attention_with_causal_mask(self): - """Causal mask produces lower-triangular attention pattern.""" + def test_causal_mask(self): b, s, h, d = 1, 4, 1, 8 q = torch.randn(b, s, h, d) - k = torch.randn(b, s, h, d) v = torch.eye(s).unsqueeze(0).unsqueeze(2).expand(b, s, h, s) - # With identity V and causal, output should reflect causal structure - out = self.mod._diffusers_eager_attention(q, k, v, is_causal=True) + out = self.mod._diffusers_eager_attention(q, q, v, is_causal=True) assert out.shape == (b, s, h, s) - def test_eager_attention_with_mask(self): - """Attention mask is applied correctly.""" + def test_attn_mask(self): b, s, h, d = 1, 4, 2, 8 q = torch.randn(b, s, h, d) - k = torch.randn(b, s, h, d) - v = torch.randn(b, s, h, d) - # Mask that blocks all positions -> output should be mean of V - mask = torch.zeros(b, 1, s, s) # no masking - out = self.mod._diffusers_eager_attention(q, k, v, attn_mask=mask) + mask = torch.zeros(b, 1, s, s) + out = self.mod._diffusers_eager_attention(q, q, q, attn_mask=mask) assert out.shape == (b, s, h, d) - def test_eager_attention_gqa(self): - """GQA: fewer KV heads are repeated to match Q heads.""" + def test_gqa(self): b, s, hq, hkv, d = 1, 4, 8, 2, 16 q = torch.randn(b, s, hq, d) k = torch.randn(b, s, hkv, d) v = torch.randn(b, s, hkv, d) - out = self.mod._diffusers_eager_attention(q, k, v, enable_gqa=True) assert out.shape == (b, s, hq, d) def test_register_idempotent(self): - """Registration is safe to call multiple times.""" self.mod.register_diffusers_eager_attention() - self.mod.register_diffusers_eager_attention() # second call should not raise + self.mod.register_diffusers_eager_attention() def test_get_backend_before_register_raises(self): - """Getting backend before registration raises RuntimeError.""" self.mod._BACKEND_REGISTERED = False with pytest.raises(RuntimeError, match="not registered"): self.mod.get_skip_softmax_attention_backend() @@ -204,83 +224,52 @@ def 
test_get_backend_before_register_raises(self): class TestDiffusersTritonAttention: - """Test diffusers Triton attention backend with mocked dependencies.""" - @pytest.fixture(autouse=True) - def _setup_mocks(self): - """Inject mock diffusers and triton_fa modules.""" - mocks = _make_mock_diffusers() - mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_triton_attention" - sys.modules.pop(mod_name, None) - - # Mock the triton_fa.attention function - def fake_attention(q, k, v, **kw): - return q # just return q as output - - mocks["modelopt.torch.kernels.triton_fa"] = types.ModuleType( - "modelopt.torch.kernels.triton_fa" + def _setup(self): + _, self.mod = _import_fresh( + f"{_KERNELS_PKG}.diffusers_triton_attention", _make_mock_diffusers() ) - mocks["modelopt.torch.kernels.triton_fa"].attention = fake_attention + yield + _purge_kernel_modules() - with patch.dict(sys.modules, mocks): - self.mod = importlib.import_module(mod_name) - yield - sys.modules.pop(mod_name, None) - - def test_triton_attention_basic(self): - """Triton attention reshapes correctly [B,S,H,D] -> varlen -> [B,S,H,D].""" + def test_basic(self): b, s, h, d = 2, 8, 4, 16 q = torch.randn(b, s, h, d) - k = torch.randn(b, s, h, d) - v = torch.randn(b, s, h, d) - - out = self.mod._diffusers_triton_attention(q, k, v) + out = self.mod._diffusers_triton_attention(q, q, q) assert out.shape == (b, s, h, d) - def test_triton_attention_cross_attention(self): - """Different Q/KV sequence lengths produce separate varlen metadata.""" + def test_cross_attention(self): b, sq, sk, h, d = 1, 4, 12, 2, 8 q = torch.randn(b, sq, h, d) k = torch.randn(b, sk, h, d) v = torch.randn(b, sk, h, d) - out = self.mod._diffusers_triton_attention(q, k, v) assert out.shape == (b, sq, h, d) def test_set_clear_config(self): - """Thread-local config set/clear cycle.""" self.mod.set_triton_skip_softmax_config(threshold=0.1) assert self.mod._thread_local.skip_threshold == 0.1 
self.mod.clear_triton_skip_softmax_config() assert self.mod._thread_local.skip_threshold is None - def test_threshold_passed_to_kernel(self): - """When threshold is set, it appears in kernel kwargs.""" - captured_kw = {} - original_attention = self.mod.attention - - def spy_attention(q, k, v, **kw): - captured_kw.update(kw) - return q - - self.mod.attention = spy_attention + def test_threshold_forwarded(self): + captured = {} + orig = self.mod.attention + self.mod.attention = lambda q, k, v, **kw: (captured.update(kw), q)[1] try: self.mod.set_triton_skip_softmax_config(threshold=0.05) - b, s, h, d = 1, 4, 2, 8 - q = torch.randn(b, s, h, d) + q = torch.randn(1, 4, 2, 8) self.mod._diffusers_triton_attention(q, q, q) - assert captured_kw.get("skip_softmax_threshold") == 0.05 + assert captured.get("skip_softmax_threshold") == 0.05 finally: - self.mod.attention = original_attention + self.mod.attention = orig self.mod.clear_triton_skip_softmax_config() def test_register_idempotent(self): - """Registration is safe to call multiple times.""" self.mod.register_diffusers_triton_attention() self.mod.register_diffusers_triton_attention() def test_get_backend_before_register_raises(self): - """Getting backend before registration raises RuntimeError.""" self.mod._BACKEND_REGISTERED = False with pytest.raises(RuntimeError, match="not registered"): self.mod.get_triton_attention_backend() @@ -292,86 +281,53 @@ def test_get_backend_before_register_raises(self): class TestLTXEagerAttention: - """Test LTX-2 eager attention backend with mocked ltx_core.""" - @pytest.fixture(autouse=True) - def _setup_mocks(self): - """Inject mock ltx_core modules.""" - mocks = _make_mock_ltx_core() - mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.ltx_eager_attention" - sys.modules.pop(mod_name, None) - with patch.dict(sys.modules, mocks): - self.mod = importlib.import_module(mod_name) - self.FakeAttention = mocks["ltx_core.model.transformer.attention"].Attention - yield - 
sys.modules.pop(mod_name, None) - - def test_eager_attention_basic(self): - """LTX eager attention: [B, T, H*D] -> [B, T, H*D].""" + def _setup(self): + ltx_mocks = _make_mock_ltx_core() + self.kernels, self.mod = _import_fresh(f"{_KERNELS_PKG}.ltx_eager_attention", ltx_mocks) + self.FakeAttention = ltx_mocks["ltx_core.model.transformer.attention"].Attention + yield + _purge_kernel_modules() + + def test_basic(self): b, t, h, d = 2, 8, 4, 16 q = torch.randn(b, t, h * d) - k = torch.randn(b, t, h * d) - v = torch.randn(b, t, h * d) - - out = self.mod._ltx_eager_attention(q, k, v, heads=h) + out = self.mod._ltx_eager_attention(q, q, q, heads=h) assert out.shape == (b, t, h * d) - def test_eager_attention_with_mask(self): - """LTX eager attention handles 2D and 3D masks.""" + def test_masks(self): b, t, h, d = 1, 4, 2, 8 q = torch.randn(b, t, h * d) - k = torch.randn(b, t, h * d) - v = torch.randn(b, t, h * d) - - # 2D mask [t, t] - mask_2d = torch.zeros(t, t) - out = self.mod._ltx_eager_attention(q, k, v, heads=h, mask=mask_2d) + out = self.mod._ltx_eager_attention(q, q, q, heads=h, mask=torch.zeros(t, t)) assert out.shape == (b, t, h * d) - - # 3D mask [b, t, t] - mask_3d = torch.zeros(b, t, t) - out = self.mod._ltx_eager_attention(q, k, v, heads=h, mask=mask_3d) + out = self.mod._ltx_eager_attention(q, q, q, heads=h, mask=torch.zeros(b, t, t)) assert out.shape == (b, t, h * d) - def test_wrapper_routes_to_eager_when_active(self): - """Wrapper calls eager attention when skip-softmax context is active.""" - from modelopt.torch.sparsity.attention_sparsity.kernels import set_skip_softmax_context - + def test_wrapper_routing(self): original_fn = MagicMock(return_value=torch.zeros(1, 4, 32)) wrapper = self.mod._SkipSoftmaxLTXAttentionWrapper(original_fn) - b, t, h, d = 1, 4, 2, 16 q = torch.randn(b, t, h * d) - k = torch.randn(b, t, h * d) - v = torch.randn(b, t, h * d) - # Inactive: calls original - out = wrapper(q, k, v, heads=h) + wrapper(q, q, q, heads=h) 
original_fn.assert_called_once() - # Active: calls eager (not original) original_fn.reset_mock() - set_skip_softmax_context(True) + self.kernels.set_skip_softmax_context(True) try: - out = wrapper(q, k, v, heads=h) + out = wrapper(q, q, q, heads=h) original_fn.assert_not_called() assert out.shape == (b, t, h * d) finally: - set_skip_softmax_context(False) + self.kernels.set_skip_softmax_context(False) - def test_register_patches_attention_modules(self): - """register_ltx_eager_attention patches Attention modules in model.""" + def test_register_idempotent(self): model = nn.Sequential() - attn = self.FakeAttention() - model.add_module("attn", attn) - + model.add_module("attn", self.FakeAttention()) self.mod.register_ltx_eager_attention(model) - - assert isinstance(attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) - - # Idempotent: second call doesn't double-wrap + assert isinstance(model.attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) self.mod.register_ltx_eager_attention(model) - assert isinstance(attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) + assert isinstance(model.attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) # --------------------------------------------------------------------------- @@ -380,105 +336,66 @@ def test_register_patches_attention_modules(self): class TestLTXTritonAttention: - """Test LTX-2 Triton attention backend with mocked dependencies.""" - @pytest.fixture(autouse=True) - def _setup_mocks(self): - """Inject mock ltx_core and triton_fa modules.""" - mocks = _make_mock_ltx_core() - mod_name = "modelopt.torch.sparsity.attention_sparsity.kernels.ltx_triton_attention" - sys.modules.pop(mod_name, None) - - def fake_attention(q, k, v, **kw): - return q - - mocks["modelopt.torch.kernels.triton_fa"] = types.ModuleType( - "modelopt.torch.kernels.triton_fa" - ) - mocks["modelopt.torch.kernels.triton_fa"].attention = fake_attention - - with patch.dict(sys.modules, mocks): - 
self.mod = importlib.import_module(mod_name) - self.FakeAttention = mocks["ltx_core.model.transformer.attention"].Attention - yield - sys.modules.pop(mod_name, None) - - def test_triton_attention_basic(self): - """LTX triton attention: [B, T, H*D] -> varlen -> [B, T, H*D].""" + def _setup(self): + ltx_mocks = _make_mock_ltx_core() + _, self.mod = _import_fresh(f"{_KERNELS_PKG}.ltx_triton_attention", ltx_mocks) + self.FakeAttention = ltx_mocks["ltx_core.model.transformer.attention"].Attention + yield + _purge_kernel_modules() + + def test_basic(self): b, t, h, d = 2, 8, 4, 16 q = torch.randn(b, t, h * d) - k = torch.randn(b, t, h * d) - v = torch.randn(b, t, h * d) - - out = self.mod._ltx_triton_attention(q, k, v, heads=h, threshold=0.1) + out = self.mod._ltx_triton_attention(q, q, q, heads=h, threshold=0.1) assert out.shape == (b, t, h * d) def test_set_clear_context(self): - """Thread-local context set/clear cycle.""" self.mod.set_ltx_triton_context(active=True, threshold=0.05) active, threshold = self.mod._get_ltx_triton_context() assert active is True assert threshold == 0.05 - self.mod.clear_ltx_triton_context() active, threshold = self.mod._get_ltx_triton_context() assert active is False assert threshold is None - def test_wrapper_routes_to_triton_when_active(self): - """Wrapper calls Triton attention when context is active.""" + def test_wrapper_routing(self): original_fn = MagicMock(return_value=torch.zeros(1, 4, 32)) wrapper = self.mod._TritonLTXAttentionWrapper(original_fn) - b, t, h, d = 1, 4, 2, 16 q = torch.randn(b, t, h * d) - k = torch.randn(b, t, h * d) - v = torch.randn(b, t, h * d) - # Inactive: calls original - out = wrapper(q, k, v, heads=h) + wrapper(q, q, q, heads=h) original_fn.assert_called_once() - # Active: calls triton (not original) original_fn.reset_mock() self.mod.set_ltx_triton_context(active=True, threshold=0.1) try: - out = wrapper(q, k, v, heads=h) + out = wrapper(q, q, q, heads=h) original_fn.assert_not_called() assert out.shape == 
(b, t, h * d) finally: self.mod.clear_ltx_triton_context() - def test_register_patches_attention_modules(self): - """register_ltx_triton_attention patches Attention modules.""" + def test_register_idempotent(self): model = nn.Sequential() - attn = self.FakeAttention() - model.add_module("attn", attn) - + model.add_module("attn", self.FakeAttention()) self.mod.register_ltx_triton_attention(model) - assert isinstance(attn.attention_function, self.mod._TritonLTXAttentionWrapper) - - # Idempotent + assert isinstance(model.attn.attention_function, self.mod._TritonLTXAttentionWrapper) self.mod.register_ltx_triton_attention(model) - assert isinstance(attn.attention_function, self.mod._TritonLTXAttentionWrapper) + assert isinstance(model.attn.attention_function, self.mod._TritonLTXAttentionWrapper) - def test_threshold_passed_to_kernel(self): - """When threshold is set, it appears in kernel kwargs.""" - captured_kw = {} - original_attention = self.mod.attention - - def spy_attention(q, k, v, **kw): - captured_kw.update(kw) - return q - - self.mod.attention = spy_attention + def test_threshold_forwarded(self): + captured = {} + orig = self.mod.attention + self.mod.attention = lambda q, k, v, **kw: (captured.update(kw), q)[1] try: - b, t, h, d = 1, 4, 2, 8 - q = torch.randn(b, t, h * d) - self.mod._ltx_triton_attention(q, q, q, heads=h, threshold=0.07) - assert captured_kw.get("skip_softmax_threshold") == 0.07 + q = torch.randn(1, 4, 16) + self.mod._ltx_triton_attention(q, q, q, heads=2, threshold=0.07) + assert captured.get("skip_softmax_threshold") == 0.07 finally: - self.mod.attention = original_attention + self.mod.attention = orig # --------------------------------------------------------------------------- @@ -487,29 +404,21 @@ def spy_attention(q, k, v, **kw): class TestRegisterDiffusersBackends: - """Test _register_diffusers_backends_if_needed with mocked imports.""" - def test_no_diffusers_no_error(self): - """When diffusers is not installed, function completes 
without error.""" from modelopt.torch.sparsity.attention_sparsity.conversion import ( _register_diffusers_backends_if_needed, ) - model = nn.Linear(10, 10) - # Should not raise even if diffusers is not installed - _register_diffusers_backends_if_needed(model) + _register_diffusers_backends_if_needed(nn.Linear(10, 10)) def test_with_diffusers_model(self): - """When model is a diffusers ModelMixin, backends are registered.""" from modelopt.torch.sparsity.attention_sparsity.conversion import ( _register_diffusers_backends_if_needed, ) - # Create a fake ModelMixin so isinstance check passes mock_mixin = type("ModelMixin", (nn.Module,), {}) mock_modeling_utils = types.ModuleType("diffusers.models.modeling_utils") mock_modeling_utils.ModelMixin = mock_mixin - fake_model = mock_mixin() with ( From 4c179a3bb8664ad05971f4251f20e778cd96e73a Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Thu, 2 Apr 2026 21:29:49 +0000 Subject: [PATCH 04/21] Fixed the test case Signed-off-by: Jingyu Xin --- .../attention_sparsity/plugins/huggingface.py | 3 +- .../test_kernel_backends.py | 409 ++++-------------- 2 files changed, 92 insertions(+), 320 deletions(-) diff --git a/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py b/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py index 988d48418b..d26b73f0b4 100644 --- a/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py +++ b/modelopt/torch/sparsity/attention_sparsity/plugins/huggingface.py @@ -16,7 +16,6 @@ """Dynamic sparse attention registration for HuggingFace models.""" import torch.nn as nn -import transformers from modelopt.torch.opt.dynamic import DynamicModule @@ -112,6 +111,8 @@ def _is_supported_model(model: nn.Module) -> bool: """ # Check for HuggingFace PreTrainedModel try: + import transformers + if isinstance(model, transformers.PreTrainedModel): return True except ImportError: diff --git a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py 
b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py index 22ea4580b1..b8685b8410 100644 --- a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py +++ b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py @@ -13,13 +13,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Unit tests for diffusers/LTX kernel backends with mocked dependencies. +"""Unit tests for diffusers kernel backends and thread-local context.""" -These tests verify the attention computation logic and registration without -requiring diffusers, ltx_core, or a GPU (triton driver). -""" - -import importlib import sys import types from unittest.mock import MagicMock, patch @@ -28,134 +23,52 @@ import torch import torch.nn as nn -# --------------------------------------------------------------------------- -# Module names that must be cleaned from sys.modules between tests -# --------------------------------------------------------------------------- -_KERNELS_PKG = "modelopt.torch.sparsity.attention_sparsity.kernels" -_ALL_KERNEL_MODS = [ - _KERNELS_PKG, - f"{_KERNELS_PKG}.diffusers_eager_attention", - f"{_KERNELS_PKG}.diffusers_triton_attention", - f"{_KERNELS_PKG}.ltx_eager_attention", - f"{_KERNELS_PKG}.ltx_triton_attention", -] - - -def _purge_kernel_modules(): - """Remove all kernel backend modules from sys.modules.""" - for name in _ALL_KERNEL_MODS: - sys.modules.pop(name, None) - - -# --------------------------------------------------------------------------- -# Helpers: build mock module dicts -# --------------------------------------------------------------------------- - - -def _make_base_mocks(): - """Mocks needed by every test: modelopt.torch.kernels + triton_fa.""" - mock_kernels = types.ModuleType("modelopt.torch.kernels") - - def fake_attention(q, k, v, **kw): - return q - mock_kernels.IS_AVAILABLE = True - mock_kernels.attention = fake_attention - 
mock_kernels.register_triton_attention = None - - mock_triton_fa = types.ModuleType("modelopt.torch.kernels.triton_fa") - mock_triton_fa.attention = fake_attention - - return { - "modelopt.torch.kernels": mock_kernels, - "modelopt.torch.kernels.triton_fa": mock_triton_fa, - } - - -def _make_mock_diffusers(): - """Mock diffusers.models.attention_dispatch.""" - mock_diffusers = types.ModuleType("diffusers") - mock_models = types.ModuleType("diffusers.models") - mock_ad = types.ModuleType("diffusers.models.attention_dispatch") +def _mock_diffusers(): + """Mock diffusers.models.attention_dispatch for testing without real diffusers.""" + m = types.ModuleType("diffusers.models.attention_dispatch") class FakeBackendName(str): _member_map_: dict = {} _value2member_map_: dict = {} - mock_ad.AttentionBackendName = FakeBackendName + m.AttentionBackendName = FakeBackendName - class FakeRegistry: + class FakeReg: _backends: dict = {} _constraints: dict = {} _supported_arg_names: dict = {} - mock_ad._AttentionBackendRegistry = FakeRegistry - mock_ad.attention_backend = MagicMock() - - mock_diffusers.models = mock_models - mock_models.attention_dispatch = mock_ad + m._AttentionBackendRegistry = FakeReg + m.attention_backend = MagicMock() return { - "diffusers": mock_diffusers, - "diffusers.models": mock_models, - "diffusers.models.attention_dispatch": mock_ad, + "diffusers": types.ModuleType("diffusers"), + "diffusers.models": types.ModuleType("diffusers.models"), + "diffusers.models.attention_dispatch": m, } -def _make_mock_ltx_core(): - """Mock ltx_core.model.transformer.attention.""" - mock_ltx = types.ModuleType("ltx_core") - mock_model = types.ModuleType("ltx_core.model") - mock_tf = types.ModuleType("ltx_core.model.transformer") - mock_attn = types.ModuleType("ltx_core.model.transformer.attention") - - class FakeAttention(nn.Module): - def __init__(self): - super().__init__() - self.attention_function = lambda q, k, v, heads, mask=None: q - - mock_attn.Attention = 
FakeAttention - mock_ltx.model = mock_model - mock_model.transformer = mock_tf - mock_tf.attention = mock_attn - return { - "ltx_core": mock_ltx, - "ltx_core.model": mock_model, - "ltx_core.model.transformer": mock_tf, - "ltx_core.model.transformer.attention": mock_attn, - } - - -def _import_fresh(mod_name: str, extra_mocks: dict): - """Purge kernel modules, patch sys.modules, and reimport ``mod_name``.""" - _purge_kernel_modules() - mocks = {**_make_base_mocks(), **extra_mocks} - with patch.dict(sys.modules, mocks): - # Reimport the parent package first so submodule imports resolve - kernels_pkg = importlib.import_module(_KERNELS_PKG) - mod = importlib.import_module(mod_name) - return kernels_pkg, mod - - # --------------------------------------------------------------------------- -# Tests: kernels/__init__.py thread-local context +# Tests: thread-local skip-softmax context # --------------------------------------------------------------------------- class TestSkipSoftmaxContext: - @pytest.fixture(autouse=True) - def _setup(self): - self.kernels, _ = _import_fresh(_KERNELS_PKG, {}) - yield - _purge_kernel_modules() - def test_default_is_false(self): - assert self.kernels.get_skip_softmax_context() is False + from modelopt.torch.sparsity.attention_sparsity.kernels import get_skip_softmax_context + + assert get_skip_softmax_context() is False def test_set_and_get(self): - self.kernels.set_skip_softmax_context(True) - assert self.kernels.get_skip_softmax_context() is True - self.kernels.set_skip_softmax_context(False) - assert self.kernels.get_skip_softmax_context() is False + from modelopt.torch.sparsity.attention_sparsity.kernels import ( + get_skip_softmax_context, + set_skip_softmax_context, + ) + + set_skip_softmax_context(True) + assert get_skip_softmax_context() is True + set_skip_softmax_context(False) + assert get_skip_softmax_context() is False # --------------------------------------------------------------------------- @@ -166,56 +79,47 @@ def 
test_set_and_get(self): class TestDiffusersEagerAttention: @pytest.fixture(autouse=True) def _setup(self): - _, self.mod = _import_fresh( - f"{_KERNELS_PKG}.diffusers_eager_attention", _make_mock_diffusers() - ) - yield - _purge_kernel_modules() + with patch.dict(sys.modules, _mock_diffusers()): + from modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_eager_attention import ( + _diffusers_eager_attention, + get_skip_softmax_attention_backend, + register_diffusers_eager_attention, + ) + + self._fn = _diffusers_eager_attention + self._register = register_diffusers_eager_attention + self._get_backend = get_skip_softmax_attention_backend - def test_basic(self): - b, s, h, d = 2, 8, 4, 16 - q = torch.randn(b, s, h, d) - out = self.mod._diffusers_eager_attention(q, q, q) - assert out.shape == (b, s, h, d) + import modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_eager_attention as mod + + mod._BACKEND_REGISTERED = False + yield + + def test_basic_shape(self): + q = torch.randn(2, 8, 4, 16) + assert self._fn(q, q, q).shape == (2, 8, 4, 16) def test_cross_attention(self): - b, sq, sk, h, d = 1, 4, 12, 2, 8 - q = torch.randn(b, sq, h, d) - k = torch.randn(b, sk, h, d) - v = torch.randn(b, sk, h, d) - out = self.mod._diffusers_eager_attention(q, k, v) - assert out.shape == (b, sq, h, d) - - def test_causal_mask(self): - b, s, h, d = 1, 4, 1, 8 - q = torch.randn(b, s, h, d) - v = torch.eye(s).unsqueeze(0).unsqueeze(2).expand(b, s, h, s) - out = self.mod._diffusers_eager_attention(q, q, v, is_causal=True) - assert out.shape == (b, s, h, s) - - def test_attn_mask(self): - b, s, h, d = 1, 4, 2, 8 - q = torch.randn(b, s, h, d) - mask = torch.zeros(b, 1, s, s) - out = self.mod._diffusers_eager_attention(q, q, q, attn_mask=mask) - assert out.shape == (b, s, h, d) + q = torch.randn(1, 4, 2, 8) + k = torch.randn(1, 12, 2, 8) + assert self._fn(q, k, k).shape == (1, 4, 2, 8) + + def test_causal(self): + q = torch.randn(1, 4, 1, 8) + assert self._fn(q, q, q, 
is_causal=True).shape == (1, 4, 1, 8) def test_gqa(self): - b, s, hq, hkv, d = 1, 4, 8, 2, 16 - q = torch.randn(b, s, hq, d) - k = torch.randn(b, s, hkv, d) - v = torch.randn(b, s, hkv, d) - out = self.mod._diffusers_eager_attention(q, k, v, enable_gqa=True) - assert out.shape == (b, s, hq, d) + q = torch.randn(1, 4, 8, 16) + k = torch.randn(1, 4, 2, 16) + assert self._fn(q, k, k, enable_gqa=True).shape == (1, 4, 8, 16) def test_register_idempotent(self): - self.mod.register_diffusers_eager_attention() - self.mod.register_diffusers_eager_attention() + self._register() + self._register() def test_get_backend_before_register_raises(self): - self.mod._BACKEND_REGISTERED = False with pytest.raises(RuntimeError, match="not registered"): - self.mod.get_skip_softmax_attention_backend() + self._get_backend() # --------------------------------------------------------------------------- @@ -226,176 +130,44 @@ def test_get_backend_before_register_raises(self): class TestDiffusersTritonAttention: @pytest.fixture(autouse=True) def _setup(self): - _, self.mod = _import_fresh( - f"{_KERNELS_PKG}.diffusers_triton_attention", _make_mock_diffusers() - ) - yield - _purge_kernel_modules() - - def test_basic(self): - b, s, h, d = 2, 8, 4, 16 - q = torch.randn(b, s, h, d) - out = self.mod._diffusers_triton_attention(q, q, q) - assert out.shape == (b, s, h, d) - - def test_cross_attention(self): - b, sq, sk, h, d = 1, 4, 12, 2, 8 - q = torch.randn(b, sq, h, d) - k = torch.randn(b, sk, h, d) - v = torch.randn(b, sk, h, d) - out = self.mod._diffusers_triton_attention(q, k, v) - assert out.shape == (b, sq, h, d) + mocks = _mock_diffusers() + mk = types.ModuleType("modelopt.torch.kernels") + mk.attention = lambda q, k, v, **kw: q + mk.IS_AVAILABLE = True + mk.register_triton_attention = None + mocks["modelopt.torch.kernels"] = mk + + with patch.dict(sys.modules, mocks): + from modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_triton_attention import ( + 
_diffusers_triton_attention, + clear_triton_skip_softmax_config, + get_triton_attention_backend, + register_diffusers_triton_attention, + set_triton_skip_softmax_config, + ) + + self._fn = _diffusers_triton_attention + self._set = set_triton_skip_softmax_config + self._clear = clear_triton_skip_softmax_config + self._register = register_diffusers_triton_attention + self._get_backend = get_triton_attention_backend + + import modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_triton_attention as mod + + mod._BACKEND_REGISTERED = False + yield def test_set_clear_config(self): - self.mod.set_triton_skip_softmax_config(threshold=0.1) - assert self.mod._thread_local.skip_threshold == 0.1 - self.mod.clear_triton_skip_softmax_config() - assert self.mod._thread_local.skip_threshold is None - - def test_threshold_forwarded(self): - captured = {} - orig = self.mod.attention - self.mod.attention = lambda q, k, v, **kw: (captured.update(kw), q)[1] - try: - self.mod.set_triton_skip_softmax_config(threshold=0.05) - q = torch.randn(1, 4, 2, 8) - self.mod._diffusers_triton_attention(q, q, q) - assert captured.get("skip_softmax_threshold") == 0.05 - finally: - self.mod.attention = orig - self.mod.clear_triton_skip_softmax_config() + self._set(threshold=0.1) + self._clear() def test_register_idempotent(self): - self.mod.register_diffusers_triton_attention() - self.mod.register_diffusers_triton_attention() + self._register() + self._register() def test_get_backend_before_register_raises(self): - self.mod._BACKEND_REGISTERED = False with pytest.raises(RuntimeError, match="not registered"): - self.mod.get_triton_attention_backend() - - -# --------------------------------------------------------------------------- -# Tests: LTX eager attention -# --------------------------------------------------------------------------- - - -class TestLTXEagerAttention: - @pytest.fixture(autouse=True) - def _setup(self): - ltx_mocks = _make_mock_ltx_core() - self.kernels, self.mod = 
_import_fresh(f"{_KERNELS_PKG}.ltx_eager_attention", ltx_mocks) - self.FakeAttention = ltx_mocks["ltx_core.model.transformer.attention"].Attention - yield - _purge_kernel_modules() - - def test_basic(self): - b, t, h, d = 2, 8, 4, 16 - q = torch.randn(b, t, h * d) - out = self.mod._ltx_eager_attention(q, q, q, heads=h) - assert out.shape == (b, t, h * d) - - def test_masks(self): - b, t, h, d = 1, 4, 2, 8 - q = torch.randn(b, t, h * d) - out = self.mod._ltx_eager_attention(q, q, q, heads=h, mask=torch.zeros(t, t)) - assert out.shape == (b, t, h * d) - out = self.mod._ltx_eager_attention(q, q, q, heads=h, mask=torch.zeros(b, t, t)) - assert out.shape == (b, t, h * d) - - def test_wrapper_routing(self): - original_fn = MagicMock(return_value=torch.zeros(1, 4, 32)) - wrapper = self.mod._SkipSoftmaxLTXAttentionWrapper(original_fn) - b, t, h, d = 1, 4, 2, 16 - q = torch.randn(b, t, h * d) - - wrapper(q, q, q, heads=h) - original_fn.assert_called_once() - - original_fn.reset_mock() - self.kernels.set_skip_softmax_context(True) - try: - out = wrapper(q, q, q, heads=h) - original_fn.assert_not_called() - assert out.shape == (b, t, h * d) - finally: - self.kernels.set_skip_softmax_context(False) - - def test_register_idempotent(self): - model = nn.Sequential() - model.add_module("attn", self.FakeAttention()) - self.mod.register_ltx_eager_attention(model) - assert isinstance(model.attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) - self.mod.register_ltx_eager_attention(model) - assert isinstance(model.attn.attention_function, self.mod._SkipSoftmaxLTXAttentionWrapper) - - -# --------------------------------------------------------------------------- -# Tests: LTX triton attention -# --------------------------------------------------------------------------- - - -class TestLTXTritonAttention: - @pytest.fixture(autouse=True) - def _setup(self): - ltx_mocks = _make_mock_ltx_core() - _, self.mod = _import_fresh(f"{_KERNELS_PKG}.ltx_triton_attention", ltx_mocks) - 
self.FakeAttention = ltx_mocks["ltx_core.model.transformer.attention"].Attention - yield - _purge_kernel_modules() - - def test_basic(self): - b, t, h, d = 2, 8, 4, 16 - q = torch.randn(b, t, h * d) - out = self.mod._ltx_triton_attention(q, q, q, heads=h, threshold=0.1) - assert out.shape == (b, t, h * d) - - def test_set_clear_context(self): - self.mod.set_ltx_triton_context(active=True, threshold=0.05) - active, threshold = self.mod._get_ltx_triton_context() - assert active is True - assert threshold == 0.05 - self.mod.clear_ltx_triton_context() - active, threshold = self.mod._get_ltx_triton_context() - assert active is False - assert threshold is None - - def test_wrapper_routing(self): - original_fn = MagicMock(return_value=torch.zeros(1, 4, 32)) - wrapper = self.mod._TritonLTXAttentionWrapper(original_fn) - b, t, h, d = 1, 4, 2, 16 - q = torch.randn(b, t, h * d) - - wrapper(q, q, q, heads=h) - original_fn.assert_called_once() - - original_fn.reset_mock() - self.mod.set_ltx_triton_context(active=True, threshold=0.1) - try: - out = wrapper(q, q, q, heads=h) - original_fn.assert_not_called() - assert out.shape == (b, t, h * d) - finally: - self.mod.clear_ltx_triton_context() - - def test_register_idempotent(self): - model = nn.Sequential() - model.add_module("attn", self.FakeAttention()) - self.mod.register_ltx_triton_attention(model) - assert isinstance(model.attn.attention_function, self.mod._TritonLTXAttentionWrapper) - self.mod.register_ltx_triton_attention(model) - assert isinstance(model.attn.attention_function, self.mod._TritonLTXAttentionWrapper) - - def test_threshold_forwarded(self): - captured = {} - orig = self.mod.attention - self.mod.attention = lambda q, k, v, **kw: (captured.update(kw), q)[1] - try: - q = torch.randn(1, 4, 16) - self.mod._ltx_triton_attention(q, q, q, heads=2, threshold=0.07) - assert captured.get("skip_softmax_threshold") == 0.07 - finally: - self.mod.attention = orig + self._get_backend() # 
--------------------------------------------------------------------------- @@ -417,12 +189,11 @@ def test_with_diffusers_model(self): ) mock_mixin = type("ModelMixin", (nn.Module,), {}) - mock_modeling_utils = types.ModuleType("diffusers.models.modeling_utils") - mock_modeling_utils.ModelMixin = mock_mixin - fake_model = mock_mixin() + mock_utils = types.ModuleType("diffusers.models.modeling_utils") + mock_utils.ModelMixin = mock_mixin with ( - patch.dict(sys.modules, {"diffusers.models.modeling_utils": mock_modeling_utils}), + patch.dict(sys.modules, {"diffusers.models.modeling_utils": mock_utils}), patch( "modelopt.torch.sparsity.attention_sparsity.kernels.register_diffusers_eager_attention", MagicMock(), @@ -432,6 +203,6 @@ def test_with_diffusers_model(self): MagicMock(), ) as mock_triton, ): - _register_diffusers_backends_if_needed(fake_model) + _register_diffusers_backends_if_needed(mock_mixin()) mock_eager.assert_called_once() mock_triton.assert_called_once() From 8702b7ba9d1dd0e147bb6a10dd44351b23e6a2c7 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Mon, 6 Apr 2026 17:20:48 +0000 Subject: [PATCH 05/21] Removed the token import Signed-off-by: Jingyu Xin --- .../sparsity/attention_sparsity/calibration/calibrate.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py index da64e87d64..3215a7530b 100644 --- a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py +++ b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py @@ -21,7 +21,6 @@ import torch import torch.nn as nn -from transformers import AutoTokenizer from modelopt.torch.utils import get_module_device @@ -32,8 +31,10 @@ from .ruler_dataset import RulerDatasetBuilder -def _load_tokenizer(tokenizer_name_or_path: str) -> "AutoTokenizer": +def _load_tokenizer(tokenizer_name_or_path: str): """Load tokenizer and ensure 
pad_token is set.""" + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path) if not tokenizer.pad_token: tokenizer.pad_token = tokenizer.eos_token From 70099a5abbd942a857fbadc98cf520e18a24a798 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Mon, 6 Apr 2026 20:26:23 +0000 Subject: [PATCH 06/21] removed the unused code Signed-off-by: Jingyu Xin --- .../sparsity/attention_sparsity/conversion.py | 48 +++++-------------- 1 file changed, 12 insertions(+), 36 deletions(-) diff --git a/modelopt/torch/sparsity/attention_sparsity/conversion.py b/modelopt/torch/sparsity/attention_sparsity/conversion.py index e0aa2fad2b..3a1d52fbd4 100644 --- a/modelopt/torch/sparsity/attention_sparsity/conversion.py +++ b/modelopt/torch/sparsity/attention_sparsity/conversion.py @@ -402,46 +402,32 @@ def export_sparse_attention_config(model: nn.Module) -> dict[str, Any] | None: if calibration_params is None: return None - # Detect calibration type from params - sample_params = next(iter(calibration_params.values())) - is_percentile = "threshold" in sample_params + # Build threshold_scale_factor with model parameters + threshold_scale_factor: dict[str, Any] = { + "formula": "a * exp(b * target_sparsity)", + } + for phase in ["prefill", "decode"]: + if phase in calibration_params: + threshold_scale_factor[phase] = { + "a": calibration_params[phase]["a"], + "b": calibration_params[phase]["b"], + } # Build the export config export_config: dict[str, Any] = { "config_groups": { "group_0": { - "sparse_algo": "softmax_skip_diffusion" if is_percentile else "softmax_skip", + "sparse_algo": "softmax_skip", "targets": sorted(target_classes) if target_classes else ["Attention"], } }, + "threshold_scale_factor": threshold_scale_factor, "producer": { "name": "modelopt", "version": mo_version, }, } - if is_percentile: - threshold_config: dict[str, Any] = { - "formula": "skip if gap >= threshold * log(seq_k)", - } - for phase in ["prefill", "decode"]: - if 
phase in calibration_params: - threshold_config[phase] = { - "threshold": calibration_params[phase]["threshold"], - } - export_config["threshold_config"] = threshold_config - else: - threshold_scale_factor: dict[str, Any] = { - "formula": "a * exp(b * target_sparsity)", - } - for phase in ["prefill", "decode"]: - if phase in calibration_params: - threshold_scale_factor[phase] = { - "a": calibration_params[phase]["a"], - "b": calibration_params[phase]["b"], - } - export_config["threshold_scale_factor"] = threshold_scale_factor - return export_config @@ -513,16 +499,6 @@ def _format_threshold(info: dict) -> str: s = target.get(phase, 0.5) parts.append(f"{phase}: a={a:.4f}, b={b:.2f}, target={s:.0%}") return f"calibrated({', '.join(parts)})" - if t == "dynamic_calibrated_percentile": - params = info.get("calibration_params", {}) - target = info.get("target_sparse_ratio", {}) - parts = [] - for phase in ["prefill", "decode"]: - if phase in params and "threshold" in params[phase]: - th = params[phase]["threshold"] - s = target.get(phase, 0.2) - parts.append(f"{phase}: threshold={th:.4f}, target={s:.0%}") - return f"percentile({', '.join(parts)})" if t == "static": v = info.get("value") if isinstance(v, dict): From 6cc96a40eb4381324af5aa291a3d54b7690e1233 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Mon, 6 Apr 2026 20:34:45 +0000 Subject: [PATCH 07/21] Update the README Signed-off-by: Jingyu Xin --- examples/diffusers/README.md | 61 +++++++++++++++++++ .../diffusers/sparsity/wan22_skip_softmax.py | 4 +- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/examples/diffusers/README.md b/examples/diffusers/README.md index 6af226752d..d013493f89 100644 --- a/examples/diffusers/README.md +++ b/examples/diffusers/README.md @@ -13,6 +13,7 @@ Cache Diffusion is a technique that reuses cached outputs from previous diffusio | Pre-Requisites | Required & optional packages to use this technique | \[[Link](#pre-requisites)\] | | | Getting Started | Learn how to optimize 
your models using quantization/cache diffusion to reduce precision and improve inference efficiency | \[[Link](#getting-started)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | Support Matrix | View the support matrix to see quantization/cahce diffusion compatibility and feature availability across different models | \[[Link](#support-matrix)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | +| Sparse Attention (Skip-Softmax) | Skip-softmax sparse attention for diffusion models | \[[Link](#sparse-attention-skip-softmax)\] | | | Cache Diffusion | Caching technique to accelerate inference without compromising quality | \[[Link](#cache-diffusion)\] | | | Post Training Quantization (PTQ) | Example scripts on how to run PTQ on diffusion models | \[[Link](#post-training-quantization-ptq)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | | Quantization Aware Training (QAT) | Example scripts on how to run QAT on diffusion models | \[[Link](#quantization-aware-training-qat)\] | \[[docs](https://nvidia.github.io/Model-Optimizer/guides/1_quantization.html)\] | @@ -276,6 +277,66 @@ mto.restore(pipe.unet, your_quantized_ckpt) By following these steps, your PEFT LoRA model should be efficiently quantized using ModelOpt, ready for deployment while maximizing performance. +## Sparse Attention (Skip-Softmax) + +Skip-softmax sparse attention skips KV tiles whose attention scores are negligible during the softmax computation, reducing FLOPs without retraining. An exponential model (`scale_factor = a * exp(b * target_sparsity)`) is calibrated once, then the target sparsity can be adjusted at runtime without recalibration. + +### Getting Started + +```python +import modelopt.torch.sparsity.attention_sparsity as mtsa + +# 1. 
Define config with calibration +config = { + "sparse_cfg": { + "calibration": { + "target_sparse_ratio": {"prefill": 0.25}, + "threshold_trials": [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, + 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 3e-1, 5e-1, 7e-1], + }, + "*.attn1": { + "method": "flash_skip_softmax", + "thresholds": {"prefill": [1e-3]}, + "br": 128, "bc": 128, + "backend": "pytorch", + "is_causal": False, + "collect_stats": True, + "enable": True, + }, + "*.attn2": {"enable": False}, + "default": {"enable": False}, + }, +} + +# 2. Provide a calibration forward loop +def forward_loop(model): + pipeline(prompt="a cat", num_frames=81, num_inference_steps=40, ...) + +# 3. Sparsify + calibrate +mtsa.sparsify(transformer, config, forward_loop=forward_loop) + +# 4. Generate as usual — sparsity is applied automatically +output = pipeline(prompt="a dog on the beach", ...) +``` + +### Example Scripts + +#### LTX-2 [Script](./sparsity/ltx2_skip_softmax.py) + +```bash +python sparsity/ltx2_skip_softmax.py \ + --prompt "A cat playing piano" --output out.mp4 \ + --calibrate --target-sparsity 0.25 --skip-first-last 3 +``` + +#### Wan 2.2 [Script](./sparsity/wan22_skip_softmax.py) + +```bash +python sparsity/wan22_skip_softmax.py \ + --prompt "A sunset over mountains" --output out.mp4 \ + --calibrate --target-sparsity 0.25 --skip-first-last 2 +``` + ## Cache Diffusion Cache Diffusion methods, such as [DeepCache](https://arxiv.org/abs/2312.00858), [Block Caching](https://arxiv.org/abs/2312.03209) and [T-Gate](https://arxiv.org/abs/2404.02747), optimize performance by reusing cached outputs from previous steps instead of recalculating them. This **training-free** caching approach is compatible with a variety of models, like **DiT** and **UNet**, enabling considerable acceleration without compromising quality. 
diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index ac2031a6d7..cb30321e3c 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -96,8 +96,8 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--skip-first-last", type=int, - default=0, - help="Number of first/last transformer layers to keep dense (default: 0)", + default=2, + help="Number of first/last transformer layers to keep dense (default: 2)", ) # Calibration options From 4de0d3baa49fd254194767ff6cb8017e8b27aff9 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Tue, 7 Apr 2026 02:33:07 +0000 Subject: [PATCH 08/21] Updated the example script Signed-off-by: Jingyu Xin --- examples/diffusers/README.md | 25 +- .../diffusers/sparsity/ltx2_skip_softmax.py | 397 ------------------ .../diffusers/sparsity/wan22_skip_softmax.py | 104 +++-- 3 files changed, 89 insertions(+), 437 deletions(-) delete mode 100644 examples/diffusers/sparsity/ltx2_skip_softmax.py diff --git a/examples/diffusers/README.md b/examples/diffusers/README.md index d013493f89..551fd6ec9f 100644 --- a/examples/diffusers/README.md +++ b/examples/diffusers/README.md @@ -290,9 +290,10 @@ import modelopt.torch.sparsity.attention_sparsity as mtsa config = { "sparse_cfg": { "calibration": { - "target_sparse_ratio": {"prefill": 0.25}, + "target_sparse_ratio": {"prefill": 0.5}, "threshold_trials": [1e-6, 5e-6, 1e-5, 5e-5, 1e-4, 5e-4, 1e-3, 5e-3, - 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 3e-1, 5e-1, 7e-1], + 1e-2, 2e-2, 5e-2, 1e-1, 2e-1, 3e-1, 5e-1, 7e-1, + 8e-1, 9e-1, 9.9e-1], }, "*.attn1": { "method": "flash_skip_softmax", @@ -321,20 +322,22 @@ output = pipeline(prompt="a dog on the beach", ...) 
### Example Scripts -#### LTX-2 [Script](./sparsity/ltx2_skip_softmax.py) - -```bash -python sparsity/ltx2_skip_softmax.py \ - --prompt "A cat playing piano" --output out.mp4 \ - --calibrate --target-sparsity 0.25 --skip-first-last 3 -``` - #### Wan 2.2 [Script](./sparsity/wan22_skip_softmax.py) +The 14B model automatically sparsifies both `transformer` and `transformer_2`. + ```bash +# 5B model (4 calibration prompts from OpenVid-1M, 151 frames, 40 steps) +python sparsity/wan22_skip_softmax.py \ + --model-path Wan-AI/Wan2.2-TI2V-5B-Diffusers \ + --prompt "A sunset over mountains" --output out.mp4 \ + --calibrate --target-sparsity 0.5 --calib-size 4 + +# 14B model (both transformers sparsified) python sparsity/wan22_skip_softmax.py \ + --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ --prompt "A sunset over mountains" --output out.mp4 \ - --calibrate --target-sparsity 0.25 --skip-first-last 2 + --calibrate --target-sparsity 0.5 --calib-size 4 ``` ## Cache Diffusion diff --git a/examples/diffusers/sparsity/ltx2_skip_softmax.py b/examples/diffusers/sparsity/ltx2_skip_softmax.py deleted file mode 100644 index dae064e070..0000000000 --- a/examples/diffusers/sparsity/ltx2_skip_softmax.py +++ /dev/null @@ -1,397 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""LTX-2 inference with skip-softmax sparse attention. 
- -This example applies skip-softmax sparse attention to the LTX-2 video -generation model using exponential model calibration -(``scale_factor = a * exp(b * target_sparsity)``). - -During calibration, ``flash_skip_softmax`` with the eager attention backend -collects sparsity statistics across multiple threshold trials. The fitted -exponential model then allows runtime control of the target sparsity ratio -without recalibration. - -Only the stage-1 backbone is sparsified. Stage 2 (spatial upsampler + -distilled LoRA) runs unmodified. - -Usage:: - - # With calibration (recommended) - python ltx2_skip_softmax.py --prompt "A cat playing piano" --output out.mp4 \\ - --calibrate --target-sparsity 0.25 - - # Disable sparsity on first/last 2 layers (higher quality, less speedup) - python ltx2_skip_softmax.py --prompt "A cat playing piano" --output out.mp4 \\ - --calibrate --target-sparsity 0.25 --skip-first-last 2 -""" - -import argparse -import functools -import os - -import torch -from ltx_core.loader import LTXV_LORA_COMFY_RENAMING_MAP, LoraPathStrengthAndSDOps -from ltx_core.model.video_vae import TilingConfig, get_video_chunks_number -from ltx_pipelines.ti2vid_two_stages import TI2VidTwoStagesPipeline -from ltx_pipelines.utils.constants import ( - AUDIO_SAMPLE_RATE, - DEFAULT_2_STAGE_HEIGHT, - DEFAULT_2_STAGE_WIDTH, - DEFAULT_AUDIO_GUIDER_PARAMS, - DEFAULT_FRAME_RATE, - DEFAULT_NEGATIVE_PROMPT, - DEFAULT_NUM_INFERENCE_STEPS, - DEFAULT_SEED, - DEFAULT_VIDEO_GUIDER_PARAMS, -) -from ltx_pipelines.utils.media_io import encode_video - -import modelopt.torch.sparsity.attention_sparsity as mtsa -from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule - -# ---- Model paths (edit these or override via environment variables) ---- -CHECKPOINT_PATH = os.environ.get( - "LTX2_CHECKPOINT", - "/home/scratch.omniml_data_2/jingyux/models/LTX-2/ltx-2-19b-dev.safetensors", -) -DISTILLED_LORA_PATH = os.environ.get( - "LTX2_DISTILLED_LORA", - 
"/home/scratch.omniml_data_2/jingyux/models/LTX-2/ltx-2-19b-distilled-lora-384.safetensors", -) -SPATIAL_UPSAMPLER_PATH = os.environ.get( - "LTX2_SPATIAL_UPSAMPLER", - "/home/scratch.omniml_data_2/jingyux/models/LTX-2/ltx-2-spatial-upscaler-x2-1.0.safetensors", -) -GEMMA_ROOT = os.environ.get( - "LTX2_GEMMA_ROOT", - "/home/scratch.omniml_data_2/jingyux/models/LTX-2/gemma-3-12b-it-qat-q4_0-unquantized", -) - -DEFAULT_NUM_FRAMES = 121 -NUM_TRANSFORMER_BLOCKS = 48 - -# Default threshold trials for calibration -DEFAULT_THRESHOLD_TRIALS = [ - 1e-6, - 5e-6, - 1e-5, - 5e-5, - 1e-4, - 5e-4, - 1e-3, - 5e-3, - 1e-2, - 2e-2, - 5e-2, - 1e-1, - 2e-1, - 3e-1, - 5e-1, - 7e-1, -] - - -def parse_args() -> argparse.Namespace: - parser = argparse.ArgumentParser( - description="LTX-2 video generation with skip-softmax sparse attention" - ) - parser.add_argument("--prompt", type=str, default=None, help="Text prompt for generation") - parser.add_argument( - "--prompt-dir", - type=str, - default=None, - help="Directory of .txt prompt files (one prompt per file). 
Overrides --prompt.", - ) - parser.add_argument("--output", type=str, default="output.mp4", help="Output video path") - parser.add_argument( - "--output-dir", - type=str, - default=None, - help="Directory to save videos when using --prompt-dir", - ) - parser.add_argument( - "--num-frames", type=int, default=DEFAULT_NUM_FRAMES, help="Number of frames" - ) - parser.add_argument("--height", type=int, default=DEFAULT_2_STAGE_HEIGHT, help="Video height") - parser.add_argument("--width", type=int, default=DEFAULT_2_STAGE_WIDTH, help="Video width") - parser.add_argument("--seed", type=int, default=DEFAULT_SEED, help="Random seed") - - # Sparse attention options - parser.add_argument( - "--skip-first-last", - type=int, - default=0, - help="Number of first/last transformer layers to keep dense (default: 0)", - ) - - # Calibration options - parser.add_argument( - "--calibrate", - action="store_true", - help="Calibrate threshold via exponential model (recommended)", - ) - parser.add_argument( - "--target-sparsity", - type=float, - default=0.25, - help="Target sparsity ratio for calibration (0.0-1.0)", - ) - parser.add_argument( - "--calib-steps", - type=int, - default=10, - help="Inference steps per calibration sample", - ) - parser.add_argument( - "--calib-frames", - type=int, - default=81, - help="Number of frames per calibration sample", - ) - parser.add_argument( - "--calib-size", - type=int, - default=1, - help="Number of prompts to use for calibration", - ) - return parser.parse_args() - - -def _patch_vae_requires_grad(pipeline: TI2VidTwoStagesPipeline): - """Ensure VAE decoder weights have requires_grad=False to avoid autograd issues.""" - for ledger_attr in ("stage_1_model_ledger", "stage_2_model_ledger"): - ledger = getattr(pipeline, ledger_attr, None) - if ledger is None: - continue - for loader_name in ("video_decoder", "audio_decoder"): - orig_loader = getattr(ledger, loader_name, None) - if orig_loader is None: - continue - - def _make_patched(fn): - 
@functools.wraps(fn) - def patched(): - model = fn() - model.requires_grad_(False) - return model - - return patched - - setattr(ledger, loader_name, _make_patched(orig_loader)) - - -def build_pipeline() -> TI2VidTwoStagesPipeline: - """Build the LTX-2 two-stage video generation pipeline.""" - pipeline = TI2VidTwoStagesPipeline( - checkpoint_path=CHECKPOINT_PATH, - distilled_lora=[ - LoraPathStrengthAndSDOps(DISTILLED_LORA_PATH, 0.8, LTXV_LORA_COMFY_RENAMING_MAP) - ], - spatial_upsampler_path=SPATIAL_UPSAMPLER_PATH, - gemma_root=GEMMA_ROOT, - loras=[], - ) - _patch_vae_requires_grad(pipeline) - return pipeline - - -def build_sparse_config(args: argparse.Namespace) -> dict: - """Build sparse attention config from CLI args. - - Uses flash_skip_softmax which supports both calibration (eager attention - with F.softmax patching) and inference. Calibration fits an exponential - model: scale_factor = a * exp(b * sparsity). - """ - attn_cfg: dict = { - "method": "flash_skip_softmax", - "thresholds": {"prefill": [1e-3]}, - "br": 128, - "bc": 128, - "backend": "pytorch", - "is_causal": False, # Diffusion = bidirectional attention - "collect_stats": True, - "enable": True, - } - - sparse_cfg: dict = { - "*.attn1": attn_cfg, # Self-attention only - # Disable on all cross-attention and cross-modal attention - "*.attn2": {"enable": False}, - "*audio_attn1*": {"enable": False}, - "*audio_attn2*": {"enable": False}, - "*audio_to_video_attn*": {"enable": False}, - "*video_to_audio_attn*": {"enable": False}, - "default": {"enable": False}, - } - - # Keep first/last N layers dense for quality - for i in range(args.skip_first_last): - sparse_cfg[f"*transformer_blocks.{i}.attn*"] = {"enable": False} - sparse_cfg[f"*transformer_blocks.{NUM_TRANSFORMER_BLOCKS - 1 - i}.attn*"] = { - "enable": False - } - - config: dict = {"sparse_cfg": sparse_cfg} - - # Add calibration config with threshold trials - if args.calibrate: - sparse_cfg["calibration"] = { - "target_sparse_ratio": {"prefill": 
args.target_sparsity}, - "samples": args.calib_size, - "threshold_trials": DEFAULT_THRESHOLD_TRIALS, - } - - return config - - -def load_calib_prompts(calib_size: int) -> list[str]: - """Load calibration prompts from OpenVid-1M dataset.""" - from datasets import load_dataset - - dataset = load_dataset("nkp37/OpenVid-1M") - prompts = list(dataset["train"]["caption"][:calib_size]) - print(f"Loaded {len(prompts)} calibration prompts from OpenVid-1M") - return prompts - - -def build_calibration_forward_loop( - pipeline: TI2VidTwoStagesPipeline, - num_steps: int = 10, - num_frames: int = 81, - calib_size: int = 1, -): - """Build a forward loop for exponential model calibration. - - Generates short videos to exercise the attention mechanism at various - threshold trials, collecting sparsity statistics for the exponential fit. - """ - calib_prompts = load_calib_prompts(calib_size) - tiling_config = TilingConfig.default() - - def forward_loop(model): - for i, prompt in enumerate(calib_prompts): - print(f"Calibration [{i + 1}/{len(calib_prompts)}]: {prompt[:60]}...") - pipeline( - prompt=prompt, - negative_prompt=DEFAULT_NEGATIVE_PROMPT, - seed=DEFAULT_SEED, - height=DEFAULT_2_STAGE_HEIGHT, - width=DEFAULT_2_STAGE_WIDTH, - num_frames=num_frames, - frame_rate=DEFAULT_FRAME_RATE, - num_inference_steps=num_steps, - video_guider_params=DEFAULT_VIDEO_GUIDER_PARAMS, - audio_guider_params=DEFAULT_AUDIO_GUIDER_PARAMS, - images=[], - tiling_config=tiling_config, - ) - - return forward_loop - - -def print_sparsity_summary(transformer: torch.nn.Module) -> None: - """Print per-module sparsity statistics.""" - enabled, disabled = [], [] - for name, module in transformer.named_modules(): - if isinstance(module, SparseAttentionModule): - if module.is_enabled: - enabled.append((name, module)) - else: - disabled.append(name) - - print(f"\nSparse attention: {len(enabled)} enabled, {len(disabled)} disabled") - for name, module in enabled: - info = module.get_threshold_info() - print(f" 
{name}: {info}") - - -def main() -> None: - args = parse_args() - - # ---- Build pipeline ---- - print("Building LTX-2 pipeline...") - pipeline = build_pipeline() - - # ---- Get and sparsify the stage-1 transformer ---- - transformer = pipeline.stage_1_model_ledger.transformer() - # Pin transformer in memory so pipeline reuses the sparsified version - pipeline.stage_1_model_ledger.transformer = lambda: transformer - - config = build_sparse_config(args) - forward_loop = None - if args.calibrate: - forward_loop = build_calibration_forward_loop( - pipeline, - num_steps=args.calib_steps, - num_frames=args.calib_frames, - calib_size=args.calib_size, - ) - - print("Applying skip-softmax sparse attention...") - mtsa.sparsify(transformer, config, forward_loop=forward_loop) - - # ---- Build prompt list ---- - prompts_and_outputs: list[tuple[str, str]] = [] - if args.prompt_dir: - output_dir = args.output_dir or "output_videos" - os.makedirs(output_dir, exist_ok=True) - prompt_files = sorted(f for f in os.listdir(args.prompt_dir) if f.endswith(".txt")) - for pf in prompt_files: - with open(os.path.join(args.prompt_dir, pf)) as f: - prompt = f.read().strip() - stem = os.path.splitext(pf)[0] - prompts_and_outputs.append((prompt, os.path.join(output_dir, f"{stem}.mp4"))) - elif args.prompt: - prompts_and_outputs.append((args.prompt, args.output)) - else: - raise ValueError("Either --prompt or --prompt-dir must be provided") - - # ---- Generate ---- - tiling_config = TilingConfig.default() - for i, (prompt, output_path) in enumerate(prompts_and_outputs): - print(f"\nGenerating [{i + 1}/{len(prompts_and_outputs)}]: {prompt[:80]}...") - - video, audio = pipeline( - prompt=prompt, - negative_prompt=DEFAULT_NEGATIVE_PROMPT, - seed=args.seed, - height=args.height, - width=args.width, - num_frames=args.num_frames, - frame_rate=DEFAULT_FRAME_RATE, - num_inference_steps=DEFAULT_NUM_INFERENCE_STEPS, - video_guider_params=DEFAULT_VIDEO_GUIDER_PARAMS, - 
audio_guider_params=DEFAULT_AUDIO_GUIDER_PARAMS, - images=[], - tiling_config=tiling_config, - ) - - encode_video( - video=video, - fps=DEFAULT_FRAME_RATE, - audio=audio, - audio_sample_rate=AUDIO_SAMPLE_RATE, - output_path=output_path, - video_chunks_number=get_video_chunks_number(args.num_frames, tiling_config), - ) - print(f"Saved to {output_path}") - - # ---- Print stats ---- - print_sparsity_summary(transformer) - - -if __name__ == "__main__": - main() diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index cb30321e3c..74be8cbae2 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -48,8 +48,7 @@ import modelopt.torch.sparsity.attention_sparsity as mtsa from modelopt.torch.sparsity.attention_sparsity.sparse_attention import SparseAttentionModule -DEFAULT_MODEL_PATH = os.environ.get("WAN22_MODEL_PATH", "Wan-AI/Wan2.2-T2V-5B") -NUM_TRANSFORMER_BLOCKS = 40 +DEFAULT_MODEL_PATH = os.environ.get("WAN22_MODEL_PATH", "Wan-AI/Wan2.2-TI2V-5B-Diffusers") # Default threshold trials for calibration DEFAULT_THRESHOLD_TRIALS = [ @@ -69,6 +68,9 @@ 3e-1, 5e-1, 7e-1, + 8e-1, + 9e-1, + 9.9e-1, ] @@ -109,20 +111,26 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--target-sparsity", type=float, - default=0.25, + default=0.5, help="Target sparsity ratio for calibration (0.0-1.0)", ) parser.add_argument( "--calib-steps", type=int, - default=10, + default=40, help="Inference steps for calibration", ) parser.add_argument( "--calib-frames", type=int, - default=33, - help="Number of frames for calibration (fewer = faster)", + default=151, + help="Number of frames for calibration", + ) + parser.add_argument( + "--calib-size", + type=int, + default=4, + help="Number of calibration prompts from OpenVid-1M dataset", ) return parser.parse_args() @@ -135,7 +143,7 @@ def build_pipeline(model_path: str) -> WanPipeline: return pipe -def 
build_sparse_config(args: argparse.Namespace) -> dict: +def build_sparse_config(args: argparse.Namespace, num_blocks: int) -> dict: """Build sparse attention config from CLI args. Uses flash_skip_softmax which supports both calibration (eager attention @@ -162,7 +170,7 @@ def build_sparse_config(args: argparse.Namespace) -> dict: # Keep first/last N layers dense for quality for i in range(args.skip_first_last): sparse_cfg[f"*blocks.{i}.attn*"] = {"enable": False} - sparse_cfg[f"*blocks.{NUM_TRANSFORMER_BLOCKS - 1 - i}.attn*"] = {"enable": False} + sparse_cfg[f"*blocks.{num_blocks - 1 - i}.attn*"] = {"enable": False} config: dict = {"sparse_cfg": sparse_cfg} @@ -177,28 +185,44 @@ def build_sparse_config(args: argparse.Namespace) -> dict: return config +def load_calib_prompts(calib_size: int) -> list[str]: + """Load calibration prompts from OpenVid-1M dataset.""" + from datasets import load_dataset + + dataset = load_dataset("nkp37/OpenVid-1M", split="train") + prompts = list(dataset["caption"][:calib_size]) + print(f"Loaded {len(prompts)} calibration prompts from OpenVid-1M") + return prompts + + def build_calibration_forward_loop( pipe: WanPipeline, - prompt: str, - num_steps: int = 10, - num_frames: int = 33, + calib_size: int = 4, + num_steps: int = 40, + num_frames: int = 151, height: int = 480, width: int = 832, seed: int = 42, ): - """Build a forward loop for exponential model calibration.""" + """Build a forward loop for exponential model calibration. + + Uses prompts from OpenVid-1M dataset (same as quantization examples). + Each prompt is run individually (batch_size=1). 
+ """ + calib_prompts = load_calib_prompts(calib_size) def forward_loop(model): - print(f"Calibration: generating {num_frames} frames @ {height}x{width}...") - pipe( - prompt=prompt, - num_frames=num_frames, - height=height, - width=width, - num_inference_steps=num_steps, - guidance_scale=5.0, - generator=torch.Generator(device="cuda").manual_seed(seed), - ) + for i, prompt in enumerate(calib_prompts): + print(f"Calibration [{i + 1}/{len(calib_prompts)}]: {prompt[:60]}...") + pipe( + prompt=prompt, + num_frames=num_frames, + height=height, + width=width, + num_inference_steps=num_steps, + guidance_scale=5.0, + generator=torch.Generator(device="cuda").manual_seed(seed), + ) return forward_loop @@ -219,6 +243,17 @@ def print_sparsity_summary(model: torch.nn.Module) -> None: print(f" {name}: {info}") +def _get_num_blocks(transformer: torch.nn.Module) -> int: + """Count transformer blocks by looking for *.blocks.N.* submodules.""" + max_idx = -1 + for name, _ in transformer.named_modules(): + parts = name.split(".") + for i, part in enumerate(parts): + if part == "blocks" and i + 1 < len(parts) and parts[i + 1].isdigit(): + max_idx = max(max_idx, int(parts[i + 1])) + return max_idx + 1 + + def main() -> None: args = parse_args() @@ -226,15 +261,20 @@ def main() -> None: print(f"Loading Wan 2.2 from {args.model_path}...") pipe = build_pipeline(args.model_path) - # ---- Get and sparsify the transformer ---- - transformer = pipe.transformer + # ---- Collect transformers to sparsify ---- + # Wan 2.2 5B has one transformer; 14B has two (transformer + transformer_2) + transformers_to_sparsify = [] + if pipe.transformer is not None: + transformers_to_sparsify.append(("transformer", pipe.transformer)) + if getattr(pipe, "transformer_2", None) is not None: + transformers_to_sparsify.append(("transformer_2", pipe.transformer_2)) - config = build_sparse_config(args) + # ---- Build calibration forward loop (shared across transformers) ---- forward_loop = None if args.calibrate: 
forward_loop = build_calibration_forward_loop( pipe, - prompt=args.prompt, + calib_size=args.calib_size, num_steps=args.calib_steps, num_frames=args.calib_frames, height=args.height, @@ -242,8 +282,12 @@ def main() -> None: seed=args.seed, ) - print("Applying skip-softmax sparse attention...") - mtsa.sparsify(transformer, config, forward_loop=forward_loop) + # ---- Sparsify each transformer ---- + for name, transformer in transformers_to_sparsify: + num_blocks = _get_num_blocks(transformer) + print(f"Applying skip-softmax to {name} ({num_blocks} blocks)...") + config = build_sparse_config(args, num_blocks=num_blocks) + mtsa.sparsify(transformer, config, forward_loop=forward_loop) # ---- Generate ---- print(f"Generating: {args.prompt[:80]}...") @@ -261,7 +305,9 @@ def main() -> None: print(f"Saved to {args.output}") # ---- Print stats ---- - print_sparsity_summary(transformer) + for name, transformer in transformers_to_sparsify: + print(f"\n{name}:") + print_sparsity_summary(transformer) if __name__ == "__main__": From b3d3d4d87858a7a6e195aea1649dc240ec81b606 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Tue, 7 Apr 2026 05:22:37 +0000 Subject: [PATCH 09/21] Update the readme Signed-off-by: Jingyu Xin --- examples/diffusers/README.md | 10 +++--- .../diffusers/sparsity/wan22_skip_softmax.py | 34 +++++++++++-------- 2 files changed, 25 insertions(+), 19 deletions(-) diff --git a/examples/diffusers/README.md b/examples/diffusers/README.md index 551fd6ec9f..ae3940f73a 100644 --- a/examples/diffusers/README.md +++ b/examples/diffusers/README.md @@ -327,17 +327,17 @@ output = pipeline(prompt="a dog on the beach", ...) The 14B model automatically sparsifies both `transformer` and `transformer_2`. 
```bash -# 5B model (4 calibration prompts from OpenVid-1M, 151 frames, 40 steps) +# 5B model — calibrate + generate (4 prompts from OpenVid-1M, 151 frames, 40 steps) python sparsity/wan22_skip_softmax.py \ --model-path Wan-AI/Wan2.2-TI2V-5B-Diffusers \ - --prompt "A sunset over mountains" --output out.mp4 \ - --calibrate --target-sparsity 0.5 --calib-size 4 + --calibrate --target-sparsity 0.5 --calib-size 4 \ + --prompt "A sunset over mountains" --output out.mp4 # 14B model (both transformers sparsified) python sparsity/wan22_skip_softmax.py \ --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ - --prompt "A sunset over mountains" --output out.mp4 \ - --calibrate --target-sparsity 0.5 --calib-size 4 + --calibrate --target-sparsity 0.5 --calib-size 4 \ + --prompt "A sunset over mountains" --output out.mp4 ``` ## Cache Diffusion diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index 74be8cbae2..2456186fa0 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -78,7 +78,12 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( description="Wan 2.2 video generation with skip-softmax sparse attention" ) - parser.add_argument("--prompt", type=str, required=True, help="Text prompt for generation") + parser.add_argument( + "--prompt", + type=str, + default=None, + help="Text prompt for generation (optional, skips generation if not set)", + ) parser.add_argument("--output", type=str, default="output.mp4", help="Output video path") parser.add_argument( "--model-path", type=str, default=DEFAULT_MODEL_PATH, help="Wan 2.2 model path or HF ID" @@ -289,20 +294,21 @@ def main() -> None: config = build_sparse_config(args, num_blocks=num_blocks) mtsa.sparsify(transformer, config, forward_loop=forward_loop) - # ---- Generate ---- - print(f"Generating: {args.prompt[:80]}...") - output = pipe( - prompt=args.prompt, - 
num_frames=args.num_frames, - height=args.height, - width=args.width, - num_inference_steps=args.num_steps, - guidance_scale=args.guidance_scale, - generator=torch.Generator(device="cuda").manual_seed(args.seed), - ) + # ---- Generate (optional) ---- + if args.prompt: + print(f"Generating: {args.prompt[:80]}...") + output = pipe( + prompt=args.prompt, + num_frames=args.num_frames, + height=args.height, + width=args.width, + num_inference_steps=args.num_steps, + guidance_scale=args.guidance_scale, + generator=torch.Generator(device="cuda").manual_seed(args.seed), + ) - export_to_video(output.frames[0], args.output, fps=16) - print(f"Saved to {args.output}") + export_to_video(output.frames[0], args.output, fps=16) + print(f"Saved to {args.output}") # ---- Print stats ---- for name, transformer in transformers_to_sparsify: From 8dc6162fbade9c8061a0170560dde5460c1b8bf7 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Tue, 7 Apr 2026 19:54:03 +0000 Subject: [PATCH 10/21] Update the calibration kernel Signed-off-by: Jingyu Xin --- examples/diffusers/README.md | 6 +- .../diffusers/sparsity/wan22_skip_softmax.py | 15 +- modelopt/torch/kernels/__init__.py | 4 + modelopt/torch/kernels/triton_fa.py | 221 +++++++++++++++++- .../calibration/calibrator.py | 14 +- .../kernels/diffusers_triton_attention.py | 72 ++++-- .../kernels/ltx_triton_attention.py | 74 +++--- .../attention_sparsity/methods/registry.py | 4 + .../methods/triton_skip_softmax.py | 161 ++++++++++++- .../attention_sparsity/stats_manager.py | 13 +- 10 files changed, 501 insertions(+), 83 deletions(-) diff --git a/examples/diffusers/README.md b/examples/diffusers/README.md index ae3940f73a..a64ea2b1b4 100644 --- a/examples/diffusers/README.md +++ b/examples/diffusers/README.md @@ -296,10 +296,8 @@ config = { 8e-1, 9e-1, 9.9e-1], }, "*.attn1": { - "method": "flash_skip_softmax", - "thresholds": {"prefill": [1e-3]}, - "br": 128, "bc": 128, - "backend": "pytorch", + "method": "triton_skip_softmax", + "backend": 
"triton", "is_causal": False, "collect_stats": True, "enable": True, diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index 2456186fa0..6c6e491a25 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -151,16 +151,15 @@ def build_pipeline(model_path: str) -> WanPipeline: def build_sparse_config(args: argparse.Namespace, num_blocks: int) -> dict: """Build sparse attention config from CLI args. - Uses flash_skip_softmax which supports both calibration (eager attention - with F.softmax patching) and inference. Calibration fits an exponential - model: scale_factor = a * exp(b * sparsity). + Uses triton_skip_softmax with the Triton FA kernel for both calibration + and inference. Calibration collects multi-threshold sparsity statistics + via the Triton calibration kernel, then fits an exponential model: + scale_factor = a * exp(b * sparsity). """ attn_cfg: dict = { - "method": "flash_skip_softmax", - "thresholds": {"prefill": [1e-3]}, - "br": 128, - "bc": 128, - "backend": "pytorch", + "method": "triton_skip_softmax", + "skip_softmax_threshold": 0.1, + "backend": "triton", "is_causal": False, # Diffusion = bidirectional attention "collect_stats": True, "enable": True, diff --git a/modelopt/torch/kernels/__init__.py b/modelopt/torch/kernels/__init__.py index 24d27a1ba2..fa07b06e20 100644 --- a/modelopt/torch/kernels/__init__.py +++ b/modelopt/torch/kernels/__init__.py @@ -21,6 +21,7 @@ IS_AVAILABLE = False attention = None +attention_calibrate = None register_triton_attention = None if torch.cuda.is_available(): @@ -32,8 +33,10 @@ ), ): from .triton_fa import attention as _attention + from .triton_fa import attention_calibrate as _attention_calibrate attention = _attention + attention_calibrate = _attention_calibrate IS_AVAILABLE = True from .hf_triton_attention import register_triton_attention as _register_triton_attention @@ -42,5 +45,6 
@@ __all__ = [ "IS_AVAILABLE", "attention", + "attention_calibrate", "register_triton_attention", ] diff --git a/modelopt/torch/kernels/triton_fa.py b/modelopt/torch/kernels/triton_fa.py index 8d3b11f1af..8de2dac6d7 100644 --- a/modelopt/torch/kernels/triton_fa.py +++ b/modelopt/torch/kernels/triton_fa.py @@ -1062,4 +1062,223 @@ def attention( ) -__all__ = ["attention"] +# --------------------------------------------------------------------------- +# Calibration kernel: collect multi-threshold skip-softmax sparsity stats +# --------------------------------------------------------------------------- +@triton.jit +def _attn_fwd_calibrate( + Q, + K, + V, + qk_scale, + b_start_loc, + b_seq_len, + b_start_loc_k, + b_seq_len_k, + Out, + stride_qbs, + stride_qh, + stride_kbs, + stride_kh, + stride_vbs, + stride_vh, + stride_obs, + stride_oh, + Threshold_trials, # [NUM_THRESHOLDS] float32 — pre-scaled to log2 space + Sparsity_counters, # [NUM_THRESHOLDS * 2] int64 — [total, skipped] per threshold + kv_group_num: tl.constexpr, + BLOCK_M: tl.constexpr, + BLOCK_D: tl.constexpr, + BLOCK_N: tl.constexpr, + IS_CAUSAL: tl.constexpr, + HEAD_DIM: tl.constexpr, + NUM_THRESHOLDS: tl.constexpr, +): + """Forward kernel with multi-threshold sparsity measurement. + + Computes full attention (no skipping) while counting how many KV tiles + would be skipped at each threshold. Statistics are collected via atomic + adds to ``Sparsity_counters[t*2]`` (total tiles) and + ``Sparsity_counters[t*2+1]`` (skipped tiles). 
+ """ + batch_idx = tl.program_id(0) + head_idx = tl.program_id(1) + tile_q = tl.program_id(2) + kv_head_idx = head_idx // kv_group_num + + seq_len_q = tl.load(b_seq_len + batch_idx) + seq_len_kv = tl.load(b_seq_len_k + batch_idx) + q_offset = tl.load(b_start_loc + batch_idx) + kv_offset = tl.load(b_start_loc_k + batch_idx) + + if tile_q * BLOCK_M >= seq_len_q: + return + + q_pos = tile_q * BLOCK_M + tl.arange(0, BLOCK_M) + kv_pos = tl.arange(0, BLOCK_N) + dim_pos = tl.arange(0, BLOCK_D) + d_mask = dim_pos < HEAD_DIM + + q_ptrs = (q_offset + q_pos[:, None]) * stride_qbs + head_idx * stride_qh + dim_pos[None, :] + q = tl.load(Q + q_ptrs, mask=(q_pos[:, None] < seq_len_q) & d_mask[None, :], other=0.0) + + k_base = K + kv_head_idx * stride_kh + v_base = V + kv_head_idx * stride_vh + + row_max = tl.zeros([BLOCK_M], dtype=tl.float32) - float("inf") + row_sum = tl.zeros([BLOCK_M], dtype=tl.float32) + acc = tl.zeros([BLOCK_M, BLOCK_D], dtype=tl.float32) + + kv_bound = seq_len_kv if not IS_CAUSAL else tl.minimum((tile_q + 1) * BLOCK_M, seq_len_kv) + + for kv_start in range(0, kv_bound, BLOCK_N): + kv_start = tl.multiple_of(kv_start, BLOCK_N) + + k_offs = (kv_offset + kv_start + kv_pos[None, :]) * stride_kbs + dim_pos[:, None] + k = tl.load( + k_base + k_offs, + mask=((kv_start + kv_pos[None, :]) < seq_len_kv) & d_mask[:, None], + other=0.0, + ) + + scores = tl.dot(q, k) * qk_scale + scores = _apply_mask(scores, q_pos, kv_pos, seq_len_q, seq_len_kv, kv_start, IS_CAUSAL) + + tile_row_max = tl.max(scores, 1) + + # --- Multi-threshold sparsity measurement --- + for t in range(NUM_THRESHOLDS): + thresh = tl.load(Threshold_trials + t) + can_skip = tile_row_max < (row_max + thresh) + skip_tile = tl.min(can_skip.to(tl.int32)) == 1 + tl.atomic_add(Sparsity_counters + t * 2, 1) # total tiles + if skip_tile: + tl.atomic_add(Sparsity_counters + t * 2 + 1, 1) # skipped tiles + + # --- Always compute full attention (no skipping) --- + m_new = tl.maximum(row_max, tile_row_max) + p = 
tl.math.exp2(scores - m_new[:, None]) + l_new = tl.sum(p, 1) + correction = tl.math.exp2(row_max - m_new) + row_sum = row_sum * correction + l_new + acc = acc * correction[:, None] + + v_offs = (kv_offset + kv_start + kv_pos[:, None]) * stride_vbs + dim_pos[None, :] + v = tl.load( + v_base + v_offs, + mask=((kv_start + kv_pos[:, None]) < seq_len_kv) & d_mask[None, :], + other=0.0, + ) + acc = tl.dot(p.to(v.dtype), v, acc) + row_max = m_new + + acc = acc / row_sum[:, None] + o_ptrs = (q_offset + q_pos[:, None]) * stride_obs + head_idx * stride_oh + dim_pos[None, :] + tl.store(Out + o_ptrs, acc, mask=(q_pos[:, None] < seq_len_q) & d_mask[None, :]) + + +def attention_calibrate( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + b_start_loc: torch.Tensor, + b_seq_len: torch.Tensor, + max_input_len: int, + is_causal: bool = True, + softmax_scale: float | None = None, + b_start_loc_k: torch.Tensor | None = None, + b_seq_len_k: torch.Tensor | None = None, + max_input_len_k: int | None = None, + *, + threshold_trials: list[float] | None = None, +) -> tuple[torch.Tensor, torch.Tensor]: + """Flash attention with multi-threshold skip-softmax sparsity measurement. + + Computes full attention (identical output to dense attention) while + measuring how many KV tiles would be skipped at each threshold in + ``threshold_trials``. No autograd — forward only. + + Args: + q, k, v, b_start_loc, b_seq_len, max_input_len, is_causal, + softmax_scale, b_start_loc_k, b_seq_len_k, max_input_len_k: + Same as :func:`attention`. + threshold_trials: List of threshold values to measure sparsity for. + Each value is converted to log2-scaled space for the kernel. + + Returns: + Tuple of (output, sparsity_counters): + - output: ``[total_q_tokens, num_q_heads, head_dim]`` + - sparsity_counters: ``[num_thresholds, 2]`` int64 tensor where + ``[:, 0]`` = total tile evaluations, ``[:, 1]`` = skipped tiles. + Sparsity per threshold = ``counters[:, 1] / counters[:, 0]``. 
+ """ + if threshold_trials is None or len(threshold_trials) == 0: + raise ValueError("threshold_trials must be a non-empty list") + + HEAD_DIM = q.shape[2] + num_q_heads = q.shape[1] + num_kv_heads = k.shape[1] + kv_group_num = num_q_heads // num_kv_heads + batch = b_seq_len.shape[0] + sm_scale = 1.0 / (HEAD_DIM**0.5) if softmax_scale is None else softmax_scale + qk_scale = sm_scale * LOG2E + BLOCK_D = triton.next_power_of_2(HEAD_DIM) + BLOCK_M = 128 + BLOCK_N = 64 + + if b_seq_len_k is None: + b_seq_len_k = b_seq_len + b_start_loc_k = b_start_loc + + num_thresholds = len(threshold_trials) + + # Convert thresholds to log2-scaled space: log2(lambda) * sm_scale + threshold_tensor = torch.tensor( + [math.log2(t) * sm_scale for t in threshold_trials], + dtype=torch.float32, + device=q.device, + ) + + # Atomic counters: [num_thresholds * 2] — flat layout [total_0, skip_0, total_1, skip_1, ...] + sparsity_counters = torch.zeros(num_thresholds * 2, dtype=torch.int64, device=q.device) + + o = torch.empty_like(q) + + grid = (batch, num_q_heads, triton.cdiv(max_input_len, BLOCK_M)) + + _attn_fwd_calibrate[grid]( + q, + k, + v, + qk_scale, + b_start_loc, + b_seq_len, + b_start_loc_k, + b_seq_len_k, + o, + q.stride(0), + q.stride(1), + k.stride(0), + k.stride(1), + v.stride(0), + v.stride(1), + o.stride(0), + o.stride(1), + threshold_tensor, + sparsity_counters, + kv_group_num=kv_group_num, + BLOCK_M=BLOCK_M, + BLOCK_D=BLOCK_D, + BLOCK_N=BLOCK_N, + IS_CAUSAL=is_causal, + HEAD_DIM=HEAD_DIM, + NUM_THRESHOLDS=num_thresholds, + num_warps=4, + num_stages=1, + ) + + # Reshape to [num_thresholds, 2] + return o, sparsity_counters.view(num_thresholds, 2) + + +__all__ = ["attention", "attention_calibrate"] diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py index 6821206937..f2bb8831d7 100644 --- a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py +++ 
b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py @@ -333,6 +333,16 @@ def _extract_calibration_stats( return aggregated_stats def _set_thresholds(self, modules: list[nn.Module], thresholds: list[float]): - """Set thresholds list on sparse attention modules.""" + """Set thresholds list on sparse attention modules. + + Supports both flash_skip_softmax (sets ``thresholds`` attribute) and + triton_skip_softmax (sets ``_threshold_trials`` attribute). + """ for module in modules: - module._sparse_method_instance.thresholds = thresholds + method = module._sparse_method_instance + if hasattr(method, "_threshold_trials"): + # triton_skip_softmax: calibration uses Triton calibration kernel + method._threshold_trials = thresholds + else: + # flash_skip_softmax: calibration uses F.softmax patching + method.thresholds = thresholds diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py index 17fec4e4eb..2e44d41258 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py @@ -17,9 +17,12 @@ Registers a ``modelopt_triton`` backend in diffusers' ``_AttentionBackendRegistry`` that converts the diffusers [B, S, H, D] layout to the Triton FA kernel's varlen -[total_tokens, H, D] format. Supports skip-softmax tile skipping for sparse attention. +[total_tokens, H, D] format. -Used during **inference** -- calibration uses the eager backend instead. +Two modes: +- **Inference**: Calls ``attention()`` with skip-softmax tile skipping. +- **Calibration**: Calls ``attention_calibrate()`` to collect multi-threshold + sparsity statistics without skipping any tiles. 
""" import inspect @@ -33,24 +36,47 @@ attention_backend, ) -from modelopt.torch.kernels import attention +from modelopt.torch.kernels import attention, attention_calibrate _BACKEND_NAME = "modelopt_triton" _BACKEND_REGISTERED = False # Thread-local storage for per-forward skip-softmax configuration. -# The method's get_sparse_context() sets these before each forward pass. _thread_local = threading.local() -def set_triton_skip_softmax_config(threshold: float | None = None) -> None: - """Set thread-local skip-softmax config for the next Triton attention call.""" +def set_triton_skip_softmax_config( + threshold: float | None = None, + calibration_mode: bool = False, + threshold_trials: list[float] | None = None, +) -> None: + """Set thread-local skip-softmax config for the next Triton attention call. + + Args: + threshold: Skip-softmax threshold for inference mode. + calibration_mode: If True, use the calibration kernel to collect + multi-threshold sparsity stats instead of skipping tiles. + threshold_trials: List of thresholds to measure sparsity for + (only used when calibration_mode=True). 
+ """ _thread_local.skip_threshold = threshold + _thread_local.calibration_mode = calibration_mode + _thread_local.threshold_trials = threshold_trials + # Accumulated counters across all attention calls in one forward pass + _thread_local.calibration_counters = None def clear_triton_skip_softmax_config() -> None: """Clear thread-local skip-softmax config.""" _thread_local.skip_threshold = None + _thread_local.calibration_mode = False + _thread_local.threshold_trials = None + _thread_local.calibration_counters = None + + +def get_calibration_counters() -> "torch.Tensor | None": + """Return accumulated calibration counters ``[num_thresholds, 2]`` or None.""" + return getattr(_thread_local, "calibration_counters", None) # --------------------------------------------------------------------------- @@ -68,11 +94,7 @@ def _diffusers_triton_attention( scale: float | None = None, enable_gqa: bool = False, ) -> torch.Tensor: - """Compute attention via Triton FA kernel on diffusers layout ``[B, S, H, D]``. - - Converts to the kernel's varlen format, calls the Triton FA kernel, and - converts back. 
- """ + """Compute attention via Triton FA kernel on diffusers layout ``[B, S, H, D]``.""" batch, seq_q, num_heads_q, head_dim = query.shape seq_k = key.shape[1] device = query.device @@ -97,7 +119,6 @@ def _diffusers_triton_attention( "softmax_scale": scale, } - # If Q and KV have different sequence lengths, pass separate KV metadata if seq_q != seq_k: b_start_loc_k = torch.arange(batch, device=device, dtype=torch.int32) * seq_k b_seq_len_k = torch.full((batch,), seq_k, device=device, dtype=torch.int32) @@ -105,15 +126,29 @@ def _diffusers_triton_attention( kw["b_seq_len_k"] = b_seq_len_k kw["max_input_len_k"] = seq_k - # Read skip-softmax config from thread-local storage + # --- Calibration mode: collect multi-threshold stats --- + calib_mode = getattr(_thread_local, "calibration_mode", False) + if calib_mode: + trials = getattr(_thread_local, "threshold_trials", None) + if trials and attention_calibrate is not None: + o, counters = attention_calibrate(q, k, v, **kw, threshold_trials=trials) + + # Accumulate counters across all attention calls in this forward pass + prev = getattr(_thread_local, "calibration_counters", None) + if prev is None: + _thread_local.calibration_counters = counters + else: + _thread_local.calibration_counters = prev + counters + + return o.view(batch, seq_q, num_heads_q, head_dim) + + # --- Inference mode: skip-softmax with single threshold --- threshold = getattr(_thread_local, "skip_threshold", None) if threshold is not None and threshold > 0.0: kw["skip_softmax_threshold"] = threshold assert attention is not None, "Triton attention kernel not available (requires CUDA + triton)" o = attention(q, k, v, **kw) - - # Reshape back: [B*S, H, D] -> [B, S, H, D] return o.view(batch, seq_q, num_heads_q, head_dim) @@ -131,14 +166,12 @@ def register_diffusers_triton_attention() -> None: if _BACKEND_REGISTERED: return - # Extend the AttentionBackendName enum with our custom value new_member = str.__new__(AttentionBackendName, _BACKEND_NAME) 
new_member._name_ = "MODELOPT_TRITON" new_member._value_ = _BACKEND_NAME AttentionBackendName._member_map_["MODELOPT_TRITON"] = new_member AttentionBackendName._value2member_map_[_BACKEND_NAME] = new_member - # Register the backend function _AttentionBackendRegistry._backends[new_member] = _diffusers_triton_attention _AttentionBackendRegistry._constraints[new_member] = [] _AttentionBackendRegistry._supported_arg_names[new_member] = set( @@ -149,10 +182,7 @@ def register_diffusers_triton_attention() -> None: def get_triton_attention_backend(): - """Return a context manager that activates the modelopt_triton backend. - - Raises RuntimeError if the backend has not been registered yet. - """ + """Return a context manager that activates the modelopt_triton backend.""" if not _BACKEND_REGISTERED: raise RuntimeError( "modelopt_triton backend not registered. " diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py index ddb880026c..8ef2569d5b 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py @@ -15,11 +15,9 @@ """Triton flash attention wrapper for LTX-2 (ltx_core) skip-softmax sparse attention. -Patches ``Attention`` modules from ``ltx_core`` so that when the Triton -skip-softmax flag is active, attention is computed via the Triton FA kernel -with fused tile skipping. - -Used during **inference** -- calibration uses the eager wrapper instead. +Two modes: +- **Inference**: ``attention()`` with skip-softmax tile skipping. +- **Calibration**: ``attention_calibrate()`` to collect multi-threshold stats. 
""" import math @@ -28,7 +26,7 @@ import torch from ltx_core.model.transformer.attention import Attention -from modelopt.torch.kernels import attention +from modelopt.torch.kernels import attention, attention_calibrate # Thread-local storage for skip-softmax configuration _thread_local = threading.local() @@ -37,16 +35,25 @@ def set_ltx_triton_context( active: bool, threshold: float | None = None, + calibration_mode: bool = False, + threshold_trials: list[float] | None = None, ) -> None: """Set thread-local Triton config for LTX-2 attention.""" _thread_local.active = active _thread_local.threshold = threshold + _thread_local.calibration_mode = calibration_mode + _thread_local.threshold_trials = threshold_trials + if not calibration_mode: + _thread_local.calibration_counters = None def clear_ltx_triton_context() -> None: """Clear thread-local Triton config.""" _thread_local.active = False _thread_local.threshold = None + _thread_local.calibration_mode = False + _thread_local.threshold_trials = None + _thread_local.calibration_counters = None def _get_ltx_triton_context() -> tuple[bool, float | None]: @@ -57,6 +64,11 @@ def _get_ltx_triton_context() -> tuple[bool, float | None]: ) +def get_calibration_counters() -> "torch.Tensor | None": + """Return accumulated calibration counters ``[num_thresholds, 2]`` or None.""" + return getattr(_thread_local, "calibration_counters", None) + + def _ltx_triton_attention( q: torch.Tensor, k: torch.Tensor, @@ -65,22 +77,16 @@ def _ltx_triton_attention( mask: torch.Tensor | None = None, threshold: float | None = None, ) -> torch.Tensor: - """Triton FA attention on LTX-2 layout ``[B, T, H*D]``. - - Converts from LTX-2's fused-head layout to the Triton kernel's varlen - format, calls the kernel with skip-softmax, and converts back. 
- """ + """Triton FA attention on LTX-2 layout ``[B, T, H*D]``.""" b, seq_q, dim_total = q.shape dim_head = dim_total // heads seq_k = k.shape[1] device = q.device - # LTX-2 layout: [B, T, H*D] -> reshape to [B, T, H, D] -> flat [B*T, H, D] q_flat = q.view(b, seq_q, heads, dim_head).reshape(b * seq_q, heads, dim_head).contiguous() k_flat = k.view(b, seq_k, heads, dim_head).reshape(b * seq_k, heads, dim_head).contiguous() v_flat = v.view(b, seq_k, heads, dim_head).reshape(b * seq_k, heads, dim_head).contiguous() - # Build varlen metadata b_start_loc_q = torch.arange(b, device=device, dtype=torch.int32) * seq_q b_seq_len_q = torch.full((b,), seq_q, device=device, dtype=torch.int32) @@ -90,11 +96,10 @@ def _ltx_triton_attention( "b_start_loc": b_start_loc_q, "b_seq_len": b_seq_len_q, "max_input_len": seq_q, - "is_causal": False, # Diffusion uses bidirectional attention + "is_causal": False, "softmax_scale": scale, } - # Handle different Q/KV sequence lengths if seq_q != seq_k: b_start_loc_k = torch.arange(b, device=device, dtype=torch.int32) * seq_k b_seq_len_k = torch.full((b,), seq_k, device=device, dtype=torch.int32) @@ -102,35 +107,37 @@ def _ltx_triton_attention( kw["b_seq_len_k"] = b_seq_len_k kw["max_input_len_k"] = seq_k - # Skip-softmax threshold + # --- Calibration mode --- + calib_mode = getattr(_thread_local, "calibration_mode", False) + if calib_mode: + trials = getattr(_thread_local, "threshold_trials", None) + if trials and attention_calibrate is not None: + o, counters = attention_calibrate(q_flat, k_flat, v_flat, **kw, threshold_trials=trials) + + prev = getattr(_thread_local, "calibration_counters", None) + if prev is None: + _thread_local.calibration_counters = counters + else: + _thread_local.calibration_counters = prev + counters + + return o.view(b, seq_q, heads * dim_head) + + # --- Inference mode --- if threshold is not None and threshold > 0.0: kw["skip_softmax_threshold"] = threshold assert attention is not None, "Triton attention kernel not 
available (requires CUDA + triton)" o = attention(q_flat, k_flat, v_flat, **kw) - - # Reshape back: [B*T, H, D] -> [B, T, H*D] return o.view(b, seq_q, heads * dim_head) class _TritonLTXAttentionWrapper: - """Wraps an ``attention_function`` callable from ltx_core. - - When the thread-local Triton skip-softmax flag is active, routes to the - Triton FA kernel. Otherwise calls the original function. - """ + """Wraps ltx_core attention_function for Triton dispatch.""" def __init__(self, original_fn): self._original_fn = original_fn - def __call__( - self, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - heads: int, - mask: torch.Tensor | None = None, - ) -> torch.Tensor: + def __call__(self, q, k, v, heads, mask=None): active, threshold = _get_ltx_triton_context() if active: return _ltx_triton_attention(q, k, v, heads, mask, threshold) @@ -138,10 +145,7 @@ def __call__( def register_ltx_triton_attention(model: torch.nn.Module) -> None: - """Walk *model* and patch all ``ltx_core.Attention`` modules for Triton dispatch. - - Safe to call multiple times -- already-wrapped modules are skipped. - """ + """Patch all ``ltx_core.Attention`` modules for Triton dispatch.""" for module in model.modules(): if isinstance(module, Attention): fn = module.attention_function diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/registry.py b/modelopt/torch/sparsity/attention_sparsity/methods/registry.py index 8037146643..3cb4f9010e 100644 --- a/modelopt/torch/sparsity/attention_sparsity/methods/registry.py +++ b/modelopt/torch/sparsity/attention_sparsity/methods/registry.py @@ -40,6 +40,10 @@ def __init__(self): # Video shape for VSA (T, H, W). None for non-VSA methods. 
self.video_shape: tuple[int, int, int] | None = None + def set_calibration_mode(self, enabled: bool) -> None: + """Enable or disable calibration mode (called by DynamicThresholdCalibrator).""" + self._calibration_mode = enabled + def forward_attention( self, query: torch.Tensor, diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py index b885eeaea5..b55af1f01f 100644 --- a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py +++ b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py @@ -13,7 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. -"""Skip-softmax method for attention via Triton kernel tile skipping.""" +"""Skip-softmax method for attention via Triton kernel tile skipping. + +Supports two modes: +- **Inference**: KV tiles with negligible scores are skipped in-kernel. +- **Calibration**: The Triton calibration kernel collects multi-threshold + sparsity statistics without skipping any tiles. +""" from contextlib import contextmanager @@ -41,6 +47,8 @@ def __init__(self, method_config=None): super().__init__() method_config = method_config or {} self.skip_softmax_threshold = method_config.get("skip_softmax_threshold", 0.1) + # Calibration state + self._threshold_trials: list[float] | None = None @property def name(self) -> str: @@ -60,14 +68,155 @@ def apply_sparsity(self, attention_scores, sparse_mask=None): ) def get_sparse_context(self, module): - """Return context manager that activates skip-softmax during forward.""" + """Return context manager that activates skip-softmax during forward. + + In calibration mode, configures the Triton backend to use the + calibration kernel which collects multi-threshold sparsity stats. + In inference mode, sets the skip threshold for tile skipping. 
+ """ + if self._calibration_mode and self._threshold_trials: + return self._triton_calibration_context(module) + return self._triton_inference_context(module) + + @contextmanager + def _triton_inference_context(self, module): + """Inference: activate skip-softmax with calibrated or fixed threshold.""" + module._apply_skip_softmax = True + + # Set threshold on Triton backends + threshold = self._get_effective_threshold(module) + self._set_triton_backends(threshold=threshold) + with self._get_diffusers_backend_context(): + try: + yield + finally: + module._apply_skip_softmax = False + self._clear_triton_backends() - @contextmanager - def _skip_softmax_context(): - module._apply_skip_softmax = True + @contextmanager + def _triton_calibration_context(self, module): + """Calibration: collect multi-threshold sparsity stats via Triton kernel.""" + module._apply_skip_softmax = True + self._set_triton_backends(calibration_mode=True, threshold_trials=self._threshold_trials) + with self._get_diffusers_backend_context(): try: yield + # After forward pass, extract counters and build stats + self._collect_calibration_stats(module) finally: module._apply_skip_softmax = False + self._clear_triton_backends() + + def _get_effective_threshold(self, module) -> float: + """Compute threshold from calibration params or use fixed value.""" + if self.calibration_params and self.target_sparse_ratio: + import math + + params = self.calibration_params.get("prefill", {}) + a = params.get("a", 0) + b = params.get("b", 0) + target = self.target_sparse_ratio.get("prefill", 0.5) + if a > 0 and b > 0: + # scale_factor = a * exp(b * target_sparsity) + # threshold = scale_factor / seqlen + # For diffusion with fixed seqlen, use a representative value. + # The actual seqlen adaptation happens at the kernel level. + scale_factor = a * math.exp(b * target) + # Use a default seqlen estimate; the kernel threshold is in + # absolute space so we just pass the raw threshold. 
+ return scale_factor / 4224 # TODO: pass actual seqlen at runtime + return self.skip_softmax_threshold + + @staticmethod + @contextmanager + def _get_diffusers_backend_context(): + """Activate the modelopt_triton diffusers backend if registered.""" + try: + from ..kernels.diffusers_triton_attention import get_triton_attention_backend + + with get_triton_attention_backend(): + yield + except (ImportError, RuntimeError): + yield + + def _set_triton_backends(self, **kwargs): + """Set config on both diffusers and LTX Triton backends.""" + try: + from ..kernels.diffusers_triton_attention import set_triton_skip_softmax_config + + set_triton_skip_softmax_config(**kwargs) + except ImportError: + pass + try: + from ..kernels.ltx_triton_attention import set_ltx_triton_context + + set_ltx_triton_context(active=True, **kwargs) + except ImportError: + pass + + def _clear_triton_backends(self): + """Clear config on both Triton backends.""" + try: + from ..kernels.diffusers_triton_attention import clear_triton_skip_softmax_config + + clear_triton_skip_softmax_config() + except ImportError: + pass + try: + from ..kernels.ltx_triton_attention import clear_ltx_triton_context + + clear_ltx_triton_context() + except ImportError: + pass + + def _collect_calibration_stats(self, module): + """Read Triton calibration counters and store as stats on the module.""" + counters = None + + try: + from ..kernels.diffusers_triton_attention import get_calibration_counters + + counters = get_calibration_counters() + except ImportError: + pass + + if counters is None: + try: + from ..kernels.ltx_triton_attention import get_calibration_counters + + counters = get_calibration_counters() + except ImportError: + pass + + if counters is None or self._threshold_trials is None: + return + + # counters: [num_thresholds, 2] — [:, 0]=total, [:, 1]=skipped + total = counters[:, 0].float() + skipped = counters[:, 1].float() + sparsity_list = (skipped / total.clamp(min=1)).tolist() + + # Estimate sample_length 
from total tiles: + # total_tiles = num_heads * num_q_tiles * num_kv_tiles * batch + # For simplicity, use total[0] as a proxy for sequence length scaling + sample_length = int(total[0].item()) + + module._last_stats = { + "sparsity": sparsity_list, + "sample_length": sample_length, + "phase": "prefill", + } - return _skip_softmax_context() + def get_threshold_info(self) -> dict: + """Get threshold information for debugging/display.""" + if self.calibration_params and self.target_sparse_ratio: + return { + "type": "dynamic_calibrated", + "formula": "threshold = a * exp(b * target_sparsity) / seqlen", + "calibration_params": self.calibration_params, + "target_sparse_ratio": self.target_sparse_ratio, + } + return { + "type": "static", + "value": self.skip_softmax_threshold, + } diff --git a/modelopt/torch/sparsity/attention_sparsity/stats_manager.py b/modelopt/torch/sparsity/attention_sparsity/stats_manager.py index de70c3cadf..3b8d9e2b92 100644 --- a/modelopt/torch/sparsity/attention_sparsity/stats_manager.py +++ b/modelopt/torch/sparsity/attention_sparsity/stats_manager.py @@ -66,12 +66,13 @@ def collect(self, stats: dict): self.aggregated_stats["total_calls"] += 1 self.aggregated_stats["total_blocks"] += stats.get("total_blocks", 0) - incoming = stats["sparse_blocks"] - if "sparse_blocks" not in self.aggregated_stats: - self.aggregated_stats["sparse_blocks"] = list(incoming) - else: - for i, val in enumerate(incoming): - self.aggregated_stats["sparse_blocks"][i] += val + incoming = stats.get("sparse_blocks") + if incoming is not None: + if "sparse_blocks" not in self.aggregated_stats: + self.aggregated_stats["sparse_blocks"] = list(incoming) + else: + for i, val in enumerate(incoming): + self.aggregated_stats["sparse_blocks"][i] += val phase = stats.get("phase", "unknown") if phase in self.aggregated_stats["phase_counts"]: From 8aa32cc22321018f72d7fe9b015b4dc2f7de1378 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Tue, 7 Apr 2026 19:58:53 +0000 Subject: [PATCH 
11/21] ADd the readme Signed-off-by: Jingyu Xin --- examples/diffusers/sparsity/README.md | 123 ++++++++++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 examples/diffusers/sparsity/README.md diff --git a/examples/diffusers/sparsity/README.md b/examples/diffusers/sparsity/README.md new file mode 100644 index 0000000000..e4aea18f0d --- /dev/null +++ b/examples/diffusers/sparsity/README.md @@ -0,0 +1,123 @@ +# Skip-Softmax Sparse Attention for Diffusion Models + +Skip-softmax sparse attention (BLASST) skips KV tiles whose attention scores +are negligible during the FlashAttention computation, reducing FLOPs without +retraining. An exponential model (`scale_factor = a * exp(b * target_sparsity)`) +is calibrated once, then the target sparsity can be adjusted at runtime without +recalibration. + +## Changes from Main Branch + +### Core Triton Kernel (`modelopt/torch/kernels/`) + +| File | Change | +|------|--------| +| `triton_fa.py` | Added `_attn_fwd_calibrate` kernel: computes full attention while measuring skip decisions for multiple thresholds via atomic counters. Added `attention_calibrate()` Python API. | +| `__init__.py` | Export `attention_calibrate` alongside `attention`. | + +The kernel has two modes: +- **Inference** (`_attn_fwd`): Autotuned, single threshold, actual tile skipping. +- **Calibration** (`_attn_fwd_calibrate`): Fixed block sizes (128×64), multi-threshold measurement, no skipping (full attention output). + +### Sparse Attention Methods (`modelopt/torch/sparsity/attention_sparsity/methods/`) + +| File | Change | +|------|--------| +| `triton_skip_softmax.py` | Extended with calibration support: `_triton_calibration_context()` sets Triton calibration mode and collects counters; `_triton_inference_context()` activates diffusers backend with calibrated threshold; `_get_diffusers_backend_context()` activates `modelopt_triton` attention backend. 
| +| `flash_skip_softmax.py` | Enhanced `get_sparse_context()` with `ExitStack` to also activate diffusers eager backend for calibration. | +| `registry.py` | Added `set_calibration_mode()` to base `SparseAttentionMethod` class. | +| `__init__.py` | Updated imports. | + +### Kernel Backends (`modelopt/torch/sparsity/attention_sparsity/kernels/`) + +| File | Change | +|------|--------| +| `__init__.py` | Added thread-local context (`set_skip_softmax_context` / `get_skip_softmax_context`), lazy imports for diffusers/LTX backends with `contextlib.suppress(ImportError, RuntimeError)`. | +| `diffusers_triton_attention.py` | **New.** Registers `modelopt_triton` backend in diffusers. Two modes: inference calls `attention()`, calibration calls `attention_calibrate()`. Accumulates counters across attention calls. | +| `diffusers_eager_attention.py` | **New.** Registers `modelopt_skip_softmax` eager backend for LLM calibration (explicit `F.softmax` for patching). | +| `ltx_triton_attention.py` | **New.** Patches `ltx_core.Attention` modules for Triton dispatch. Supports calibration and inference modes. | +| `ltx_eager_attention.py` | **New.** Patches `ltx_core.Attention` for eager attention calibration. | + +### Calibration (`modelopt/torch/sparsity/attention_sparsity/calibration/`) + +| File | Change | +|------|--------| +| `calibrate.py` | Skip RULER dataset generation when user provides `forward_loop` (required for diffusion models). Guard `from transformers import AutoTokenizer` as lazy import. | +| `calibrator.py` | `_set_thresholds()` detects method type — sets `_threshold_trials` for `triton_skip_softmax`, `thresholds` for `flash_skip_softmax`. | + +### Conversion & Config + +| File | Change | +|------|--------| +| `conversion.py` | Added `_register_diffusers_backends_if_needed()` — auto-registers diffusers/LTX backends on `sparsify()`. Updated export config and summary display. 
| +| `config.py` | Added `skip_softmax_threshold` field to `SparseAttentionAttributeConfig`. | +| `plugins/huggingface.py` | Added diffusers `ModelMixin` support in `_is_supported_model()`. Lazy `import transformers`. | +| `stats_manager.py` | Made `sparse_blocks` optional in `collect()`. Preserve `normalized_gaps` in calibration stats. | +| `sparse_attention.py` | (Changes from main for VSA support also present.) | + +### Example Scripts + +| File | Description | +|------|-------------| +| `wan22_skip_softmax.py` | **New.** Wan 2.2 text-to-video with skip-softmax. Supports 5B (single transformer) and 14B (dual transformer). Uses `triton_skip_softmax` with Triton calibration kernel. Calibration prompts from OpenVid-1M. | + +### Tests + +| File | Description | +|------|-------------| +| `test_kernel_backends.py` | **New.** Unit tests for diffusers kernel backends with mocked dependencies (no GPU required). | + +## Usage + +```bash +# Wan 2.2 5B — calibrate + generate +python wan22_skip_softmax.py \ + --model-path Wan-AI/Wan2.2-TI2V-5B-Diffusers \ + --calibrate --target-sparsity 0.5 --calib-size 4 \ + --calib-frames 151 --calib-steps 40 \ + --prompt "A cat sitting on a windowsill" --output out.mp4 + +# Wan 2.2 14B — both transformers sparsified +python wan22_skip_softmax.py \ + --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ + --calibrate --target-sparsity 0.5 --calib-size 4 \ + --calib-frames 151 --calib-steps 40 \ + --prompt "A sunset over mountains" --output out.mp4 + +# Calibrate only (no video generation) +python wan22_skip_softmax.py \ + --model-path Wan-AI/Wan2.2-TI2V-5B-Diffusers \ + --calibrate --target-sparsity 0.5 --calib-size 4 +``` + +## Architecture + +```text +mtsa.sparsify(transformer, config, forward_loop) + │ + ├─ apply_mode() → replace attention with SparseAttentionModule + │ + └─ calibrate() + │ + ├─ DynamicThresholdCalibrator._set_thresholds() + │ └─ sets method._threshold_trials = [1e-6, ..., 9.9e-1] + │ + ├─ forward_loop(model) + │ │ + │ └─ 
SparseAttentionModule.forward() + │ │ + │ └─ triton_skip_softmax._triton_calibration_context() + │ ├─ set_triton_skip_softmax_config(calibration_mode=True) + │ ├─ attention_backend("modelopt_triton") + │ ├─ _diffusers_triton_attention() → attention_calibrate() + │ │ └─ _attn_fwd_calibrate kernel (full attn + atomic counters) + │ └─ _collect_calibration_stats() → module._last_stats + │ + ├─ Fit: scale_factor = a * exp(b * sparsity) + │ + └─ Apply a, b to all modules + │ + └─ Inference: triton_skip_softmax._triton_inference_context() + ├─ threshold = a * exp(b * target) / seqlen + └─ attention() with skip_softmax_threshold → actual tile skipping +``` From fbeabcf288c3599fa23759a6ecd3588c1eeb44b2 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Tue, 7 Apr 2026 20:18:55 +0000 Subject: [PATCH 12/21] Update the example script Signed-off-by: Jingyu Xin --- .../diffusers/sparsity/wan22_skip_softmax.py | 76 ++++++++++++++----- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index 6c6e491a25..2170f46249 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -50,6 +50,16 @@ DEFAULT_MODEL_PATH = os.environ.get("WAN22_MODEL_PATH", "Wan-AI/Wan2.2-TI2V-5B-Diffusers") +# fmt: off +# ruff: noqa: RUF001 +DEFAULT_NEGATIVE_PROMPT = ( # Official Wan 2.2 negative prompt (Chinese) + "色调艳丽,过曝,静态,细节模糊不清,字幕,风格,作品,画作,画面,静止,整体发灰," + "最差质量,低质量,JPEG压缩残留,丑陋的,残缺的,多余的手指,画得不好的手部," + "画得不好的脸部,畸形的,毁容的,形态畸形的肢体,手指融合,静止不动的画面," + "杂乱的背景,三条腿,背景人很多,倒着走" +) +# fmt: on + # Default threshold trials for calibration DEFAULT_THRESHOLD_TRIALS = [ 1e-6, @@ -93,9 +103,21 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument("--height", type=int, default=480, help="Video height") parser.add_argument("--width", type=int, default=832, help="Video width") - parser.add_argument("--num-steps", type=int, default=50, help="Number of 
inference steps") + parser.add_argument("--num-steps", type=int, default=40, help="Number of inference steps") + parser.add_argument( + "--guidance-scale", type=float, default=4.0, help="Classifier-free guidance scale" + ) parser.add_argument( - "--guidance-scale", type=float, default=5.0, help="Classifier-free guidance scale" + "--guidance-scale-2", + type=float, + default=3.0, + help="Second guidance scale for 14B dual-transformer model (ignored by 5B)", + ) + parser.add_argument( + "--negative-prompt", + type=str, + default=DEFAULT_NEGATIVE_PROMPT, + help="Negative prompt", ) parser.add_argument("--seed", type=int, default=42, help="Random seed") @@ -207,6 +229,9 @@ def build_calibration_forward_loop( height: int = 480, width: int = 832, seed: int = 42, + guidance_scale: float = 4.0, + guidance_scale_2: float | None = 3.0, + negative_prompt: str = "", ): """Build a forward loop for exponential model calibration. @@ -218,15 +243,19 @@ def build_calibration_forward_loop( def forward_loop(model): for i, prompt in enumerate(calib_prompts): print(f"Calibration [{i + 1}/{len(calib_prompts)}]: {prompt[:60]}...") - pipe( - prompt=prompt, - num_frames=num_frames, - height=height, - width=width, - num_inference_steps=num_steps, - guidance_scale=5.0, - generator=torch.Generator(device="cuda").manual_seed(seed), - ) + kw: dict = { + "prompt": prompt, + "negative_prompt": negative_prompt, + "num_frames": num_frames, + "height": height, + "width": width, + "num_inference_steps": num_steps, + "guidance_scale": guidance_scale, + "generator": torch.Generator(device="cuda").manual_seed(seed), + } + if guidance_scale_2 is not None: + kw["guidance_scale_2"] = guidance_scale_2 + pipe(**kw) return forward_loop @@ -284,6 +313,9 @@ def main() -> None: height=args.height, width=args.width, seed=args.seed, + guidance_scale=args.guidance_scale, + guidance_scale_2=args.guidance_scale_2, + negative_prompt=args.negative_prompt, ) # ---- Sparsify each transformer ---- @@ -296,15 +328,19 @@ 
def main() -> None: # ---- Generate (optional) ---- if args.prompt: print(f"Generating: {args.prompt[:80]}...") - output = pipe( - prompt=args.prompt, - num_frames=args.num_frames, - height=args.height, - width=args.width, - num_inference_steps=args.num_steps, - guidance_scale=args.guidance_scale, - generator=torch.Generator(device="cuda").manual_seed(args.seed), - ) + pipe_kwargs: dict = { + "prompt": args.prompt, + "negative_prompt": args.negative_prompt, + "num_frames": args.num_frames, + "height": args.height, + "width": args.width, + "num_inference_steps": args.num_steps, + "guidance_scale": args.guidance_scale, + "generator": torch.Generator(device="cuda").manual_seed(args.seed), + } + if args.guidance_scale_2 is not None: + pipe_kwargs["guidance_scale_2"] = args.guidance_scale_2 + output = pipe(**pipe_kwargs) export_to_video(output.frames[0], args.output, fps=16) print(f"Saved to {args.output}") From 6a4ab8bf86ff9618ca3ea9ea7a1d3a27b25fee20 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Tue, 7 Apr 2026 14:08:59 -0700 Subject: [PATCH 13/21] Update the code Signed-off-by: Jingyu Xin --- .../diffusers/sparsity/wan22_skip_softmax.py | 2 +- .../kernels/diffusers_triton_attention.py | 31 ++++++++-- .../kernels/ltx_triton_attention.py | 28 +++++++-- .../methods/triton_skip_softmax.py | 57 ++++++++++++------- 4 files changed, 85 insertions(+), 33 deletions(-) diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index 2170f46249..f52e53639b 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -19,7 +19,7 @@ generation model (text-to-video) using exponential model calibration (``scale_factor = a * exp(b * target_sparsity)``). 
-During calibration, ``flash_skip_softmax`` with the eager attention backend +During calibration, ``triton_skip_softmax`` with the Triton calibration kernel collects sparsity statistics across multiple threshold trials. The fitted exponential model then allows runtime control of the target sparsity ratio without recalibration. diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py index 2e44d41258..8de08e8cb5 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py @@ -49,21 +49,27 @@ def set_triton_skip_softmax_config( threshold: float | None = None, calibration_mode: bool = False, threshold_trials: list[float] | None = None, + scale_factor: float | None = None, ) -> None: """Set thread-local skip-softmax config for the next Triton attention call. Args: - threshold: Skip-softmax threshold for inference mode. + threshold: Skip-softmax threshold for inference mode (static). calibration_mode: If True, use the calibration kernel to collect multi-threshold sparsity stats instead of skipping tiles. threshold_trials: List of thresholds to measure sparsity for (only used when calibration_mode=True). + scale_factor: Calibrated scale factor for dynamic threshold computation. + When set, the actual threshold is computed as ``scale_factor / seq_k`` + at attention call time, adapting to the actual sequence length. 
""" _thread_local.skip_threshold = threshold _thread_local.calibration_mode = calibration_mode _thread_local.threshold_trials = threshold_trials + _thread_local.scale_factor = scale_factor # Accumulated counters across all attention calls in one forward pass _thread_local.calibration_counters = None + _thread_local.calibration_seq_k = None def clear_triton_skip_softmax_config() -> None: @@ -71,7 +77,9 @@ def clear_triton_skip_softmax_config() -> None: _thread_local.skip_threshold = None _thread_local.calibration_mode = False _thread_local.threshold_trials = None + _thread_local.scale_factor = None _thread_local.calibration_counters = None + _thread_local.calibration_seq_k = None def get_calibration_counters() -> "torch.Tensor | None": @@ -79,6 +87,11 @@ def get_calibration_counters() -> "torch.Tensor | None": return getattr(_thread_local, "calibration_counters", None) +def get_calibration_seq_k() -> int | None: + """Return KV sequence length observed during calibration, or None.""" + return getattr(_thread_local, "calibration_seq_k", None) + + # --------------------------------------------------------------------------- # Triton attention implementation for diffusers layout # --------------------------------------------------------------------------- @@ -140,12 +153,20 @@ def _diffusers_triton_attention( else: _thread_local.calibration_counters = prev + counters + # Store actual KV sequence length for calibration stats + _thread_local.calibration_seq_k = seq_k + return o.view(batch, seq_q, num_heads_q, head_dim) - # --- Inference mode: skip-softmax with single threshold --- - threshold = getattr(_thread_local, "skip_threshold", None) - if threshold is not None and threshold > 0.0: - kw["skip_softmax_threshold"] = threshold + # --- Inference mode: skip-softmax with dynamic or static threshold --- + scale_factor = getattr(_thread_local, "scale_factor", None) + if scale_factor is not None and scale_factor > 0.0: + # Dynamic threshold: adapt to actual sequence length + 
kw["skip_softmax_threshold"] = scale_factor / seq_k + else: + threshold = getattr(_thread_local, "skip_threshold", None) + if threshold is not None and threshold > 0.0: + kw["skip_softmax_threshold"] = threshold assert attention is not None, "Triton attention kernel not available (requires CUDA + triton)" o = attention(q, k, v, **kw) diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py index 8ef2569d5b..a8d2e6e8db 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py @@ -37,14 +37,17 @@ def set_ltx_triton_context( threshold: float | None = None, calibration_mode: bool = False, threshold_trials: list[float] | None = None, + scale_factor: float | None = None, ) -> None: """Set thread-local Triton config for LTX-2 attention.""" _thread_local.active = active _thread_local.threshold = threshold _thread_local.calibration_mode = calibration_mode _thread_local.threshold_trials = threshold_trials + _thread_local.scale_factor = scale_factor if not calibration_mode: _thread_local.calibration_counters = None + _thread_local.calibration_seq_k = None def clear_ltx_triton_context() -> None: @@ -53,14 +56,17 @@ def clear_ltx_triton_context() -> None: _thread_local.threshold = None _thread_local.calibration_mode = False _thread_local.threshold_trials = None + _thread_local.scale_factor = None _thread_local.calibration_counters = None + _thread_local.calibration_seq_k = None -def _get_ltx_triton_context() -> tuple[bool, float | None]: - """Return (active, threshold).""" +def _get_ltx_triton_context() -> tuple[bool, float | None, float | None]: + """Return (active, threshold, scale_factor).""" return ( getattr(_thread_local, "active", False), getattr(_thread_local, "threshold", None), + getattr(_thread_local, "scale_factor", None), ) @@ -69,6 +75,11 @@ def 
get_calibration_counters() -> "torch.Tensor | None": return getattr(_thread_local, "calibration_counters", None) +def get_calibration_seq_k() -> int | None: + """Return KV sequence length observed during calibration, or None.""" + return getattr(_thread_local, "calibration_seq_k", None) + + def _ltx_triton_attention( q: torch.Tensor, k: torch.Tensor, @@ -120,10 +131,17 @@ def _ltx_triton_attention( else: _thread_local.calibration_counters = prev + counters + # Store actual KV sequence length for calibration stats + _thread_local.calibration_seq_k = seq_k + return o.view(b, seq_q, heads * dim_head) - # --- Inference mode --- - if threshold is not None and threshold > 0.0: + # --- Inference mode: dynamic or static threshold --- + scale_factor = getattr(_thread_local, "scale_factor", None) + if scale_factor is not None and scale_factor > 0.0: + # Dynamic threshold: adapt to actual sequence length + kw["skip_softmax_threshold"] = scale_factor / seq_k + elif threshold is not None and threshold > 0.0: kw["skip_softmax_threshold"] = threshold assert attention is not None, "Triton attention kernel not available (requires CUDA + triton)" @@ -138,7 +156,7 @@ def __init__(self, original_fn): self._original_fn = original_fn def __call__(self, q, k, v, heads, mask=None): - active, threshold = _get_ltx_triton_context() + active, threshold, scale_factor = _get_ltx_triton_context() if active: return _ltx_triton_attention(q, k, v, heads, mask, threshold) return self._original_fn(q, k, v, heads, mask) diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py index b55af1f01f..c52a0ddde7 100644 --- a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py +++ b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py @@ -83,9 +83,14 @@ def _triton_inference_context(self, module): """Inference: activate skip-softmax with calibrated or fixed 
threshold.""" module._apply_skip_softmax = True - # Set threshold on Triton backends - threshold = self._get_effective_threshold(module) - self._set_triton_backends(threshold=threshold) + # When calibrated, pass scale_factor so backends compute + # threshold = scale_factor / seq_k at call time (adapts to actual seqlen). + # Otherwise fall back to the static threshold. + scale_factor = self._get_scale_factor() + if scale_factor is not None: + self._set_triton_backends(scale_factor=scale_factor) + else: + self._set_triton_backends(threshold=self.skip_softmax_threshold) with self._get_diffusers_backend_context(): try: yield @@ -107,8 +112,12 @@ def _triton_calibration_context(self, module): module._apply_skip_softmax = False self._clear_triton_backends() - def _get_effective_threshold(self, module) -> float: - """Compute threshold from calibration params or use fixed value.""" + def _get_scale_factor(self) -> float | None: + """Compute scale_factor from calibration params, or None if uncalibrated. + + The scale_factor is sequence-length-independent. Backends divide by the + actual ``seq_k`` at call time: ``threshold = scale_factor / seq_k``. + """ if self.calibration_params and self.target_sparse_ratio: import math @@ -117,15 +126,8 @@ def _get_effective_threshold(self, module) -> float: b = params.get("b", 0) target = self.target_sparse_ratio.get("prefill", 0.5) if a > 0 and b > 0: - # scale_factor = a * exp(b * target_sparsity) - # threshold = scale_factor / seqlen - # For diffusion with fixed seqlen, use a representative value. - # The actual seqlen adaptation happens at the kernel level. - scale_factor = a * math.exp(b * target) - # Use a default seqlen estimate; the kernel threshold is in - # absolute space so we just pass the raw threshold. 
- return scale_factor / 4224 # TODO: pass actual seqlen at runtime - return self.skip_softmax_threshold + return a * math.exp(b * target) + return None @staticmethod @contextmanager @@ -172,19 +174,28 @@ def _clear_triton_backends(self): def _collect_calibration_stats(self, module): """Read Triton calibration counters and store as stats on the module.""" counters = None + seq_k = None try: - from ..kernels.diffusers_triton_attention import get_calibration_counters + from ..kernels.diffusers_triton_attention import ( + get_calibration_counters, + get_calibration_seq_k, + ) counters = get_calibration_counters() + seq_k = get_calibration_seq_k() except ImportError: pass if counters is None: try: - from ..kernels.ltx_triton_attention import get_calibration_counters + from ..kernels.ltx_triton_attention import ( + get_calibration_counters, + get_calibration_seq_k, + ) counters = get_calibration_counters() + seq_k = get_calibration_seq_k() except ImportError: pass @@ -196,10 +207,10 @@ def _collect_calibration_stats(self, module): skipped = counters[:, 1].float() sparsity_list = (skipped / total.clamp(min=1)).tolist() - # Estimate sample_length from total tiles: - # total_tiles = num_heads * num_q_tiles * num_kv_tiles * batch - # For simplicity, use total[0] as a proxy for sequence length scaling - sample_length = int(total[0].item()) + # Use actual KV sequence length from backend for the exponential model fit. + # The calibrator uses: scale_factor = threshold * sample_length, so this + # must be the real sequence length, not the total tile count. 
+ sample_length = seq_k if seq_k is not None else 0 module._last_stats = { "sparsity": sparsity_list, @@ -209,10 +220,12 @@ def _collect_calibration_stats(self, module): def get_threshold_info(self) -> dict: """Get threshold information for debugging/display.""" - if self.calibration_params and self.target_sparse_ratio: + scale_factor = self._get_scale_factor() + if scale_factor is not None: return { "type": "dynamic_calibrated", - "formula": "threshold = a * exp(b * target_sparsity) / seqlen", + "formula": "threshold = scale_factor / seq_k (computed at runtime)", + "scale_factor": scale_factor, "calibration_params": self.calibration_params, "target_sparse_ratio": self.target_sparse_ratio, } From d7dd15c59041c553644e37c8eaeb60a593a2448a Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Wed, 8 Apr 2026 15:19:58 -0700 Subject: [PATCH 14/21] Update the calibration loop Signed-off-by: Jingyu Xin --- .../diffusers/sparsity/wan22_skip_softmax.py | 210 ++++++++++++++---- modelopt/torch/kernels/triton_fa.py | 152 ++++++++++--- .../sparsity/attention_sparsity/config.py | 11 + .../kernels/diffusers_triton_attention.py | 55 ++++- .../kernels/ltx_triton_attention.py | 14 +- .../methods/triton_skip_softmax.py | 59 ++++- 6 files changed, 408 insertions(+), 93 deletions(-) diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index f52e53639b..95f42cb3ef 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -16,8 +16,15 @@ """Wan 2.2 inference with skip-softmax sparse attention. This example applies skip-softmax sparse attention to the Wan 2.2 video -generation model (text-to-video) using exponential model calibration -(``scale_factor = a * exp(b * target_sparsity)``). +generation model (text-to-video). Three modes are supported: + +1. **Baseline** — pass ``--baseline`` for dense inference (default diffusers backend). +2. 
**Triton baseline** — pass ``--triton-baseline`` for dense Triton FA kernel + (no skip-softmax, same kernel as sparse runs for apples-to-apples comparison). +3. **Fixed raw threshold** — pass ``--raw-threshold`` to supply a log2-space + threshold directly to the Triton kernel. No calibration data is needed. +4. **Calibrated threshold** — pass ``--calibrate`` to run exponential-model + calibration (``scale_factor = a * exp(b * target_sparsity)``). During calibration, ``triton_skip_softmax`` with the Triton calibration kernel collects sparsity statistics across multiple threshold trials. The fitted @@ -29,13 +36,17 @@ Usage:: - # With calibration (recommended) - python wan22_skip_softmax.py --prompt "A cat playing piano" --output out.mp4 \\ - --calibrate --target-sparsity 0.25 + # Baseline (dense, no sparsity) + python wan22_skip_softmax.py --baseline --prompt "A cat playing piano" \\ + --output baseline.mp4 + + # Fixed raw threshold (no calibration needed) + python wan22_skip_softmax.py --raw-threshold -5.0 --report-avg-sparsity \\ + --prompt "A cat playing piano" --output out.mp4 - # Custom model path - python wan22_skip_softmax.py --model-path /path/to/Wan2.2-T2V-5B \\ - --prompt "A sunset over mountains" --output sunset.mp4 --calibrate + # With calibration + python wan22_skip_softmax.py --calibrate --target-sparsity 0.25 \\ + --report-avg-sparsity --prompt "A cat playing piano" --output out.mp4 """ import argparse @@ -122,12 +133,36 @@ def parse_args() -> argparse.Namespace: parser.add_argument("--seed", type=int, default=42, help="Random seed") # Sparse attention options + parser.add_argument( + "--baseline", + action="store_true", + help="Run dense inference with default diffusers backend (no sparsity)", + ) + parser.add_argument( + "--triton-baseline", + action="store_true", + help="Run dense inference with Triton FA kernel (no skip-softmax, " + "apples-to-apples comparison with sparse runs)", + ) + parser.add_argument( + "--raw-threshold", + type=float, + 
default=None, + help="Raw skip_threshold_log2 value passed directly to the Triton kernel. " + "Negative values (e.g., -5.0 means tile must be within 5 units of running max). " + "Bypasses calibration and lambda conversion. Typical range: -1 to -30.", + ) parser.add_argument( "--skip-first-last", type=int, default=2, help="Number of first/last transformer layers to keep dense (default: 2)", ) + parser.add_argument( + "--report-avg-sparsity", + action="store_true", + help="Report per-layer and overall average tile sparsity after generation", + ) # Calibration options parser.add_argument( @@ -173,20 +208,26 @@ def build_pipeline(model_path: str) -> WanPipeline: def build_sparse_config(args: argparse.Namespace, num_blocks: int) -> dict: """Build sparse attention config from CLI args. - Uses triton_skip_softmax with the Triton FA kernel for both calibration - and inference. Calibration collects multi-threshold sparsity statistics - via the Triton calibration kernel, then fits an exponential model: - scale_factor = a * exp(b * sparsity). + Two modes: + - **Raw threshold**: ``--raw-threshold`` sets ``skip_softmax_raw_threshold`` + directly on the Triton kernel — no calibration needed. + - **Calibrated**: ``--calibrate`` collects multi-threshold sparsity statistics + via the Triton calibration kernel, then fits an exponential model: + ``scale_factor = a * exp(b * sparsity)``. 
""" attn_cfg: dict = { "method": "triton_skip_softmax", - "skip_softmax_threshold": 0.1, + "skip_softmax_threshold": 0.0 if args.triton_baseline else 0.1, "backend": "triton", "is_causal": False, # Diffusion = bidirectional attention "collect_stats": True, "enable": True, } + # Raw threshold bypasses calibration and lambda conversion + if args.raw_threshold is not None: + attn_cfg["skip_softmax_raw_threshold"] = args.raw_threshold + sparse_cfg: dict = { "*.attn1*": attn_cfg, # Self-attention only "*.attn2*": {"enable": False}, # Text cross-attention @@ -200,8 +241,8 @@ def build_sparse_config(args: argparse.Namespace, num_blocks: int) -> dict: config: dict = {"sparse_cfg": sparse_cfg} - # Add calibration config with threshold trials - if args.calibrate: + # Add calibration config only when calibrating (not with raw threshold) + if args.calibrate and args.raw_threshold is None: sparse_cfg["calibration"] = { "target_sparse_ratio": {"prefill": args.target_sparsity}, "samples": 1, @@ -260,8 +301,18 @@ def forward_loop(model): return forward_loop +def enable_sparsity_measurement(model: torch.nn.Module) -> None: + """Enable runtime sparsity measurement on all sparse attention modules.""" + for _name, module in model.named_modules(): + if isinstance(module, SparseAttentionModule) and module.is_enabled: + method = module._sparse_method_instance + if hasattr(method, "enable_measure_sparsity"): + method.reset_sparsity_counters() + method.enable_measure_sparsity(True) + + def print_sparsity_summary(model: torch.nn.Module) -> None: - """Print per-module sparsity statistics.""" + """Print per-module sparsity statistics including runtime kernel counters.""" enabled, disabled = [], [] for name, module in model.named_modules(): if isinstance(module, SparseAttentionModule): @@ -276,6 +327,38 @@ def print_sparsity_summary(model: torch.nn.Module) -> None: print(f" {name}: {info}") +def print_runtime_sparsity(model: torch.nn.Module) -> None: + """Print runtime tile sparsity measured 
via kernel atomic counters.""" + total_all = 0 + skipped_all = 0 + per_module: list[tuple[str, int, int]] = [] + + for name, module in model.named_modules(): + if isinstance(module, SparseAttentionModule) and module.is_enabled: + method = module._sparse_method_instance + if hasattr(method, "get_sparsity_counters"): + total, skipped = method.get_sparsity_counters() + if total > 0: + per_module.append((name, total, skipped)) + total_all += total + skipped_all += skipped + + if total_all == 0: + print("\nNo runtime sparsity data collected.") + return + + print("\n" + "=" * 70) + print("Runtime tile sparsity (measured via kernel atomic counters)") + print("=" * 70) + for name, total, skipped in per_module: + ratio = skipped / total + print(f" {name}: {skipped:,}/{total:,} tiles skipped ({ratio:.1%})") + ratio_all = skipped_all / total_all + print("-" * 70) + print(f" Overall: {skipped_all:,}/{total_all:,} tiles skipped ({ratio_all:.1%})") + print("=" * 70) + + def _get_num_blocks(transformer: torch.nn.Module) -> int: """Count transformer blocks by looking for *.blocks.N.* submodules.""" max_idx = -1 @@ -294,39 +377,71 @@ def main() -> None: print(f"Loading Wan 2.2 from {args.model_path}...") pipe = build_pipeline(args.model_path) - # ---- Collect transformers to sparsify ---- + # ---- Collect transformers ---- # Wan 2.2 5B has one transformer; 14B has two (transformer + transformer_2) - transformers_to_sparsify = [] + transformers = [] if pipe.transformer is not None: - transformers_to_sparsify.append(("transformer", pipe.transformer)) + transformers.append(("transformer", pipe.transformer)) if getattr(pipe, "transformer_2", None) is not None: - transformers_to_sparsify.append(("transformer_2", pipe.transformer_2)) - - # ---- Build calibration forward loop (shared across transformers) ---- - forward_loop = None - if args.calibrate: - forward_loop = build_calibration_forward_loop( - pipe, - calib_size=args.calib_size, - num_steps=args.calib_steps, - 
num_frames=args.calib_frames, - height=args.height, - width=args.width, - seed=args.seed, - guidance_scale=args.guidance_scale, - guidance_scale_2=args.guidance_scale_2, - negative_prompt=args.negative_prompt, - ) - - # ---- Sparsify each transformer ---- - for name, transformer in transformers_to_sparsify: - num_blocks = _get_num_blocks(transformer) - print(f"Applying skip-softmax to {name} ({num_blocks} blocks)...") - config = build_sparse_config(args, num_blocks=num_blocks) - mtsa.sparsify(transformer, config, forward_loop=forward_loop) + transformers.append(("transformer_2", pipe.transformer_2)) + + # ---- Sparsify (unless baseline) ---- + if args.baseline: + print("Baseline mode: running dense inference (default diffusers backend)") + elif args.triton_baseline: + print("Triton baseline: dense Triton FA kernel (no skip-softmax)") + for name, transformer in transformers: + num_blocks = _get_num_blocks(transformer) + print(f"Applying Triton backend to {name} ({num_blocks} blocks)...") + config = build_sparse_config(args, num_blocks=num_blocks) + mtsa.sparsify(transformer, config, forward_loop=None) + else: + # Build calibration forward loop if needed + forward_loop = None + if args.raw_threshold is not None: + print(f"Using fixed raw threshold: {args.raw_threshold} (skipping calibration)") + if args.calibrate: + print("Warning: --calibrate is ignored when --raw-threshold is set") + elif args.calibrate: + forward_loop = build_calibration_forward_loop( + pipe, + calib_size=args.calib_size, + num_steps=args.calib_steps, + num_frames=args.calib_frames, + height=args.height, + width=args.width, + seed=args.seed, + guidance_scale=args.guidance_scale, + guidance_scale_2=args.guidance_scale_2, + negative_prompt=args.negative_prompt, + ) + else: + print( + "Warning: neither --baseline, --raw-threshold, nor --calibrate specified; " + "using default static threshold" + ) + + for name, transformer in transformers: + num_blocks = _get_num_blocks(transformer) + 
print(f"Applying skip-softmax to {name} ({num_blocks} blocks)...") + config = build_sparse_config(args, num_blocks=num_blocks) + mtsa.sparsify(transformer, config, forward_loop=forward_loop) + + # ---- Free calibration memory before inference ---- + if not args.baseline and not args.triton_baseline and forward_loop is not None: + import gc + + gc.collect() + torch.cuda.empty_cache() + print("Cleared CUDA cache after calibration") # ---- Generate (optional) ---- if args.prompt: + # Enable runtime sparsity measurement before generation + if args.report_avg_sparsity and not args.baseline: + for _name, transformer in transformers: + enable_sparsity_measurement(transformer) + print(f"Generating: {args.prompt[:80]}...") pipe_kwargs: dict = { "prompt": args.prompt, @@ -346,9 +461,12 @@ def main() -> None: print(f"Saved to {args.output}") # ---- Print stats ---- - for name, transformer in transformers_to_sparsify: - print(f"\n{name}:") - print_sparsity_summary(transformer) + if not args.baseline: + for name, transformer in transformers: + print(f"\n{name}:") + print_sparsity_summary(transformer) + if args.report_avg_sparsity: + print_runtime_sparsity(transformer) if __name__ == "__main__": diff --git a/modelopt/torch/kernels/triton_fa.py b/modelopt/torch/kernels/triton_fa.py index 8de2dac6d7..9ea5586705 100644 --- a/modelopt/torch/kernels/triton_fa.py +++ b/modelopt/torch/kernels/triton_fa.py @@ -252,6 +252,9 @@ def _attn_fwd( DENSE_WINDOW_SIZE: tl.constexpr = 64, # Tokens near diagonal kept dense (absolute, BLOCK_N-independent) APPLY_SKIP_SOFTMAX: tl.constexpr = False, # Skip KV tiles with negligible scores SKIP_THRESHOLD_LOG2: tl.constexpr = 0.0, # log2(lambda) * sm_scale, pre-scaled for comparison on scaled scores + Sparsity_total=None, # Optional int64 scalar for counting total tiles (atomic) + Sparsity_skipped=None, # Optional int64 scalar for counting skipped tiles (atomic) + MEASURE_SPARSITY: tl.constexpr = False, # When True, count total/skipped tiles via atomic 
adds ): # --- Grid: (batch, num_q_heads, num_q_tiles) --- # Example: batch=2, num_q_heads=32, seq_len=256, BLOCK_M=128 @@ -347,6 +350,12 @@ def _attn_fwd( # Per-tile: skip entire tile only if ALL rows are negligible skip_tile = tl.min(can_skip.to(tl.int32)) == 1 + # Optional runtime sparsity measurement via atomic counters + if MEASURE_SPARSITY: + tl.atomic_add(Sparsity_total, 1) # count every tile + if skip_tile: + tl.atomic_add(Sparsity_skipped, 1) # count skipped tiles + if not skip_tile: m_new = tl.maximum(row_max, tile_row_max) p = tl.math.exp2(scores - m_new[:, None]) @@ -385,7 +394,9 @@ def _attn_fwd( row_max = m_new # --- Final normalization: output = acc / row_sum --- - acc = acc / row_sum[:, None] + # Clamp denominator to avoid 0/0 NaN when skip-softmax skips all KV tiles. + # Safe because acc is also 0 in that case (never accumulated), so 0/eps = 0. + acc = acc / tl.maximum(row_sum[:, None], 1e-6) # Save LSE for backward pass (log2-space: lse = max + log2(sum)) if STORE_LSE: @@ -768,6 +779,8 @@ def forward( num_sink_tokens, dense_window_size, skip_softmax_threshold, + skip_softmax_raw_threshold, + measure_sparsity, ): HEAD_DIM = q.shape[2] num_q_heads = q.shape[1] @@ -788,20 +801,36 @@ def forward( # Triton tiles must be powers of 2; pad head dim BLOCK_D = triton.next_power_of_2(HEAD_DIM) - # Skip-softmax: convert threshold to scaled log2 space for the kernel. - # The BLASST reference (https://arxiv.org/pdf/2512.12087) checks - # ln(lambda) on unscaled scores. Our kernel works in log2-scaled space - # (scores pre-multiplied by qk_scale = sm_scale * LOG2E), so we - # pre-scale: threshold_scaled = log2(lambda) * sm_scale. - apply_skip = skip_softmax_threshold is not None and skip_softmax_threshold > 0.0 - if apply_skip: + # Skip-softmax threshold in scaled log2 space for the kernel. + # Two modes: + # 1. raw_threshold: passed directly as skip_threshold_log2 (for testing) + # 2. 
lambda threshold: converted via log2(lambda) * sm_scale + if skip_softmax_raw_threshold is not None: + apply_skip = True + skip_threshold_log2 = skip_softmax_raw_threshold + elif skip_softmax_threshold is not None and skip_softmax_threshold > 0.0: + apply_skip = True + # The BLASST reference (https://arxiv.org/pdf/2512.12087) checks + # ln(lambda) on unscaled scores. Our kernel works in log2-scaled space + # (scores pre-multiplied by qk_scale = sm_scale * LOG2E), so we + # pre-scale: threshold_scaled = log2(lambda) * sm_scale. skip_threshold_log2 = math.log2(skip_softmax_threshold) * sm_scale else: + apply_skip = False skip_threshold_log2 = 0.0 o = torch.empty_like(q) lse = torch.empty(q.shape[0], num_q_heads, device=q.device, dtype=torch.float32) + # Optional runtime sparsity counters (single int64 scalars for atomic adds) + do_measure = measure_sparsity and apply_skip + if do_measure: + sparsity_total = torch.zeros(1, dtype=torch.int64, device=q.device) + sparsity_skipped = torch.zeros(1, dtype=torch.int64, device=q.device) + else: + sparsity_total = None + sparsity_skipped = None + # Grid: (batch, q_heads, q_tiles). Uses a function because BLOCK_M is autotuned. 
def grid(META): return (batch, num_q_heads, triton.cdiv(max_input_len, META["BLOCK_M"])) @@ -839,9 +868,17 @@ def grid(META): DENSE_WINDOW_SIZE=dense_window_size, APPLY_SKIP_SOFTMAX=apply_skip, SKIP_THRESHOLD_LOG2=skip_threshold_log2, + Sparsity_total=sparsity_total, + Sparsity_skipped=sparsity_skipped, + MEASURE_SPARSITY=do_measure, # BLOCK_M, BLOCK_N, num_warps, num_stages chosen by autotune ) + # Store sparsity counters on the output tensor for retrieval by callers + if do_measure: + o._sparsity_total = sparsity_total.item() + o._sparsity_skipped = sparsity_skipped.item() + ctx.save_for_backward(q, k, v, o, lse, b_start_loc, b_seq_len, b_start_loc_k, b_seq_len_k) ctx.max_input_len = max_input_len ctx.max_input_len_k = max_input_len_k @@ -985,6 +1022,8 @@ def backward(ctx, grad_output): None, None, None, + None, + None, ) @@ -1006,6 +1045,8 @@ def attention( num_sink_tokens: int = 0, dense_window_size: int = 64, skip_softmax_threshold: float | None = None, + skip_softmax_raw_threshold: float | None = None, + measure_sparsity: bool = False, ) -> torch.Tensor: """Variable-length flash attention with GQA, autograd, and optional N:M sparse softmax and skip-softmax. @@ -1037,6 +1078,16 @@ def attention( softmax contribution is negligible. Tiles are skipped entirely (no softmax, V load, or BMM2). The threshold is applied on unscaled scores. Set to ``None`` or ``0`` to disable. + skip_softmax_raw_threshold: Raw ``skip_threshold_log2`` value passed + directly to the kernel without conversion. The kernel skips tiles + where ``tile_row_max < row_max + raw_threshold``. Typical values + are negative (e.g., ``-5.0`` means tiles must be within 5 units of + the running max in the kernel's scaled score space). Takes + precedence over ``skip_softmax_threshold`` when both are set. + measure_sparsity: When True and skip-softmax is active, count total + and skipped tiles via atomic counters. 
The counts are stored as + ``_sparsity_total`` and ``_sparsity_skipped`` attributes on the + returned output tensor. Returns: Output tensor [total_q_tokens, num_q_heads, head_dim]. @@ -1059,6 +1110,8 @@ def attention( num_sink_tokens, dense_window_size, skip_softmax_threshold, + skip_softmax_raw_threshold, + measure_sparsity, ) @@ -1085,7 +1138,8 @@ def _attn_fwd_calibrate( stride_obs, stride_oh, Threshold_trials, # [NUM_THRESHOLDS] float32 — pre-scaled to log2 space - Sparsity_counters, # [NUM_THRESHOLDS * 2] int64 — [total, skipped] per threshold + Per_program_totals, # [num_programs * NUM_THRESHOLDS] int32 — per-program tile counts + Per_program_skipped, # [num_programs * NUM_THRESHOLDS] int32 — per-program skip counts kv_group_num: tl.constexpr, BLOCK_M: tl.constexpr, BLOCK_D: tl.constexpr, @@ -1093,13 +1147,14 @@ def _attn_fwd_calibrate( IS_CAUSAL: tl.constexpr, HEAD_DIM: tl.constexpr, NUM_THRESHOLDS: tl.constexpr, + PADDED_THRESHOLDS: tl.constexpr, # next_power_of_2(NUM_THRESHOLDS) for tl.arange ): """Forward kernel with multi-threshold sparsity measurement. Computes full attention (no skipping) while counting how many KV tiles - would be skipped at each threshold. Statistics are collected via atomic - adds to ``Sparsity_counters[t*2]`` (total tiles) and - ``Sparsity_counters[t*2+1]`` (skipped tiles). + would be skipped at each threshold. Each program writes its local counts + to ``Per_program_totals`` and ``Per_program_skipped``; the Python wrapper + sums across programs afterward. This avoids global atomic contention. """ batch_idx = tl.program_id(0) head_idx = tl.program_id(1) @@ -1129,6 +1184,17 @@ def _attn_fwd_calibrate( row_sum = tl.zeros([BLOCK_M], dtype=tl.float32) acc = tl.zeros([BLOCK_M, BLOCK_D], dtype=tl.float32) + # Pre-load all thresholds once (vectorized, stays in registers). + # tl.arange requires power-of-2 size, so use PADDED_THRESHOLDS with masking. 
+    thresh_offs = tl.arange(0, PADDED_THRESHOLDS)
+    thresh_mask = thresh_offs < NUM_THRESHOLDS
+    thresholds = tl.load(Threshold_trials + thresh_offs, mask=thresh_mask, other=float("inf"))
+
+    # Per-program local counters: avoid global atomic contention in inner loop.
+    # Each program accumulates locally, then writes once to Per_program buffers.
+    local_skipped = tl.zeros([PADDED_THRESHOLDS], dtype=tl.int32)
+    num_tiles = 0
+
     kv_bound = seq_len_kv if not IS_CAUSAL else tl.minimum((tile_q + 1) * BLOCK_M, seq_len_kv)
     for kv_start in range(0, kv_bound, BLOCK_N):
@@ -1146,14 +1212,14 @@ def _attn_fwd_calibrate(

         tile_row_max = tl.max(scores, 1)

-        # --- Multi-threshold sparsity measurement ---
-        for t in range(NUM_THRESHOLDS):
-            thresh = tl.load(Threshold_trials + t)
-            can_skip = tile_row_max < (row_max + thresh)
-            skip_tile = tl.min(can_skip.to(tl.int32)) == 1
-            tl.atomic_add(Sparsity_counters + t * 2, 1)  # total tiles
-            if skip_tile:
-                tl.atomic_add(Sparsity_counters + t * 2 + 1, 1)  # skipped tiles
+        # --- Vectorized multi-threshold sparsity measurement ---
+        # Compute the "hardest to skip" gap across all Q rows in this tile.
+        # A tile is skipped iff ALL rows satisfy: tile_row_max < row_max + thresh.
+        # Equivalently: max(tile_row_max - row_max) < thresh.
+        max_gap = tl.max(tile_row_max - row_max)  # scalar
+        skip_mask = (max_gap < thresholds).to(tl.int32)  # [PADDED_THRESHOLDS]
+        local_skipped += skip_mask
+        num_tiles += 1

         # --- Always compute full attention (no skipping) ---
         m_new = tl.maximum(row_max, tile_row_max)
@@ -1172,7 +1238,24 @@ def _attn_fwd_calibrate(
             acc = tl.dot(p.to(v.dtype), v, acc)
             row_max = m_new

-    acc = acc / row_sum[:, None]
+    # --- Write per-program counters (no atomics, just stores) ---
+    # Compute unique flat program index for this (batch, head, q_tile)
+    num_q_tiles = tl.num_programs(2)  # exact q-tile count matching the launch grid
+    num_heads = tl.num_programs(1)
+    prog_idx = batch_idx * num_heads * num_q_tiles + head_idx * num_q_tiles + tile_q
+    base = prog_idx * NUM_THRESHOLDS
+    tl.store(
+        Per_program_totals + base + thresh_offs,
+        tl.full([PADDED_THRESHOLDS], num_tiles, dtype=tl.int32),
+        mask=thresh_mask,
+    )
+    tl.store(
+        Per_program_skipped + base + thresh_offs,
+        local_skipped,
+        mask=thresh_mask,
+    )
+
+    acc = acc / tl.maximum(row_sum[:, None], 1e-6)

     o_ptrs = (q_offset + q_pos[:, None]) * stride_obs + head_idx * stride_oh + dim_pos[None, :]
     tl.store(Out + o_ptrs, acc, mask=(q_pos[:, None] < seq_len_q) & d_mask[None, :])
@@ -1239,12 +1322,19 @@ def attention_calibrate(
         device=q.device,
     )

-    # Atomic counters: [num_thresholds * 2] — flat layout [total_0, skip_0, total_1, skip_1, ...]
- sparsity_counters = torch.zeros(num_thresholds * 2, dtype=torch.int64, device=q.device) - o = torch.empty_like(q) - grid = (batch, num_q_heads, triton.cdiv(max_input_len, BLOCK_M)) + num_q_tiles = triton.cdiv(max_input_len, BLOCK_M) + grid = (batch, num_q_heads, num_q_tiles) + num_programs = batch * num_q_heads * num_q_tiles + + # Per-program output buffers (no atomics needed — each program writes its own row) + per_program_totals = torch.zeros( + num_programs * num_thresholds, dtype=torch.int32, device=q.device + ) + per_program_skipped = torch.zeros( + num_programs * num_thresholds, dtype=torch.int32, device=q.device + ) _attn_fwd_calibrate[grid]( q, @@ -1265,7 +1355,8 @@ def attention_calibrate( o.stride(0), o.stride(1), threshold_tensor, - sparsity_counters, + per_program_totals, + per_program_skipped, kv_group_num=kv_group_num, BLOCK_M=BLOCK_M, BLOCK_D=BLOCK_D, @@ -1273,12 +1364,17 @@ def attention_calibrate( IS_CAUSAL=is_causal, HEAD_DIM=HEAD_DIM, NUM_THRESHOLDS=num_thresholds, + PADDED_THRESHOLDS=triton.next_power_of_2(num_thresholds), num_warps=4, num_stages=1, ) - # Reshape to [num_thresholds, 2] - return o, sparsity_counters.view(num_thresholds, 2) + # Reduce across programs: sum per-program counts → [num_thresholds] + totals = per_program_totals.view(num_programs, num_thresholds).sum(dim=0).to(torch.int64) + skipped = per_program_skipped.view(num_programs, num_thresholds).sum(dim=0).to(torch.int64) + sparsity_counters = torch.stack([totals, skipped], dim=1) # [num_thresholds, 2] + + return o, sparsity_counters __all__ = ["attention", "attention_calibrate"] diff --git a/modelopt/torch/sparsity/attention_sparsity/config.py b/modelopt/torch/sparsity/attention_sparsity/config.py index fa415b322b..42c8b0be8e 100644 --- a/modelopt/torch/sparsity/attention_sparsity/config.py +++ b/modelopt/torch/sparsity/attention_sparsity/config.py @@ -139,6 +139,17 @@ class SparseAttentionAttributeConfig(ModeloptBaseConfig): ), ) + skip_softmax_raw_threshold: float | None = 
ModeloptField( + default=None, + title="Raw skip-softmax threshold (skip_threshold_log2).", + description=( + "Raw value passed directly to the Triton kernel as skip_threshold_log2. " + "The kernel skips tiles where tile_row_max < row_max + raw_threshold. " + "Typical values are negative (e.g., -5.0). Takes precedence over " + "skip_softmax_threshold and calibration when set." + ), + ) + @field_validator("method") @classmethod def validate_method(cls, v): diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py index 8de08e8cb5..2923447cf0 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_triton_attention.py @@ -50,6 +50,8 @@ def set_triton_skip_softmax_config( calibration_mode: bool = False, threshold_trials: list[float] | None = None, scale_factor: float | None = None, + raw_threshold: float | None = None, + measure_sparsity: bool = False, ) -> None: """Set thread-local skip-softmax config for the next Triton attention call. @@ -62,14 +64,23 @@ def set_triton_skip_softmax_config( scale_factor: Calibrated scale factor for dynamic threshold computation. When set, the actual threshold is computed as ``scale_factor / seq_k`` at attention call time, adapting to the actual sequence length. + raw_threshold: Raw ``skip_threshold_log2`` value passed directly to the + kernel without conversion. Takes precedence over other thresholds. + measure_sparsity: If True, count total and skipped tiles during + inference via atomic counters in the forward kernel. 
""" _thread_local.skip_threshold = threshold _thread_local.calibration_mode = calibration_mode _thread_local.threshold_trials = threshold_trials _thread_local.scale_factor = scale_factor + _thread_local.raw_threshold = raw_threshold + _thread_local.measure_sparsity = measure_sparsity # Accumulated counters across all attention calls in one forward pass _thread_local.calibration_counters = None _thread_local.calibration_seq_k = None + # Accumulated runtime sparsity counters (total_tiles, skipped_tiles) + _thread_local.sparsity_total = 0 + _thread_local.sparsity_skipped = 0 def clear_triton_skip_softmax_config() -> None: @@ -78,8 +89,12 @@ def clear_triton_skip_softmax_config() -> None: _thread_local.calibration_mode = False _thread_local.threshold_trials = None _thread_local.scale_factor = None + _thread_local.raw_threshold = None + _thread_local.measure_sparsity = False _thread_local.calibration_counters = None _thread_local.calibration_seq_k = None + _thread_local.sparsity_total = 0 + _thread_local.sparsity_skipped = 0 def get_calibration_counters() -> "torch.Tensor | None": @@ -92,6 +107,14 @@ def get_calibration_seq_k() -> int | None: return getattr(_thread_local, "calibration_seq_k", None) +def get_sparsity_counters() -> tuple[int, int]: + """Return accumulated runtime sparsity counters ``(total_tiles, skipped_tiles)``.""" + return ( + getattr(_thread_local, "sparsity_total", 0), + getattr(_thread_local, "sparsity_skipped", 0), + ) + + # --------------------------------------------------------------------------- # Triton attention implementation for diffusers layout # --------------------------------------------------------------------------- @@ -158,18 +181,34 @@ def _diffusers_triton_attention( return o.view(batch, seq_q, num_heads_q, head_dim) - # --- Inference mode: skip-softmax with dynamic or static threshold --- - scale_factor = getattr(_thread_local, "scale_factor", None) - if scale_factor is not None and scale_factor > 0.0: - # Dynamic threshold: adapt 
to actual sequence length - kw["skip_softmax_threshold"] = scale_factor / seq_k + # --- Inference mode: skip-softmax with raw, dynamic, or static threshold --- + raw_thresh = getattr(_thread_local, "raw_threshold", None) + if raw_thresh is not None: + # Raw threshold: passed directly to kernel as skip_threshold_log2 + kw["skip_softmax_raw_threshold"] = raw_thresh else: - threshold = getattr(_thread_local, "skip_threshold", None) - if threshold is not None and threshold > 0.0: - kw["skip_softmax_threshold"] = threshold + scale_factor = getattr(_thread_local, "scale_factor", None) + if scale_factor is not None and scale_factor > 0.0: + # Dynamic threshold: adapt to actual sequence length + kw["skip_softmax_threshold"] = scale_factor / seq_k + else: + threshold = getattr(_thread_local, "skip_threshold", None) + if threshold is not None and threshold > 0.0: + kw["skip_softmax_threshold"] = threshold assert attention is not None, "Triton attention kernel not available (requires CUDA + triton)" + do_measure = getattr(_thread_local, "measure_sparsity", False) + if do_measure: + kw["measure_sparsity"] = True o = attention(q, k, v, **kw) + + # Accumulate runtime sparsity counters from the kernel output + if do_measure and hasattr(o, "_sparsity_total"): + prev_total = getattr(_thread_local, "sparsity_total", 0) + prev_skipped = getattr(_thread_local, "sparsity_skipped", 0) + _thread_local.sparsity_total = prev_total + o._sparsity_total + _thread_local.sparsity_skipped = prev_skipped + o._sparsity_skipped + return o.view(batch, seq_q, num_heads_q, head_dim) diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py index a8d2e6e8db..a68a2512a1 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_triton_attention.py @@ -38,6 +38,8 @@ def set_ltx_triton_context( calibration_mode: 
bool = False, threshold_trials: list[float] | None = None, scale_factor: float | None = None, + raw_threshold: float | None = None, + **kwargs, ) -> None: """Set thread-local Triton config for LTX-2 attention.""" _thread_local.active = active @@ -45,6 +47,7 @@ def set_ltx_triton_context( _thread_local.calibration_mode = calibration_mode _thread_local.threshold_trials = threshold_trials _thread_local.scale_factor = scale_factor + _thread_local.raw_threshold = raw_threshold if not calibration_mode: _thread_local.calibration_counters = None _thread_local.calibration_seq_k = None @@ -57,6 +60,7 @@ def clear_ltx_triton_context() -> None: _thread_local.calibration_mode = False _thread_local.threshold_trials = None _thread_local.scale_factor = None + _thread_local.raw_threshold = None _thread_local.calibration_counters = None _thread_local.calibration_seq_k = None @@ -136,10 +140,12 @@ def _ltx_triton_attention( return o.view(b, seq_q, heads * dim_head) - # --- Inference mode: dynamic or static threshold --- + # --- Inference mode: raw, dynamic, or static threshold --- + raw_thresh = getattr(_thread_local, "raw_threshold", None) scale_factor = getattr(_thread_local, "scale_factor", None) - if scale_factor is not None and scale_factor > 0.0: - # Dynamic threshold: adapt to actual sequence length + if raw_thresh is not None: + kw["skip_softmax_raw_threshold"] = raw_thresh + elif scale_factor is not None and scale_factor > 0.0: kw["skip_softmax_threshold"] = scale_factor / seq_k elif threshold is not None and threshold > 0.0: kw["skip_softmax_threshold"] = threshold @@ -156,7 +162,7 @@ def __init__(self, original_fn): self._original_fn = original_fn def __call__(self, q, k, v, heads, mask=None): - active, threshold, scale_factor = _get_ltx_triton_context() + active, threshold, _scale_factor = _get_ltx_triton_context() if active: return _ltx_triton_attention(q, k, v, heads, mask, threshold) return self._original_fn(q, k, v, heads, mask) diff --git 
a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py index c52a0ddde7..ec00eed0c2 100644 --- a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py +++ b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py @@ -47,8 +47,15 @@ def __init__(self, method_config=None): super().__init__() method_config = method_config or {} self.skip_softmax_threshold = method_config.get("skip_softmax_threshold", 0.1) + self.skip_softmax_raw_threshold: float | None = method_config.get( + "skip_softmax_raw_threshold", None + ) # Calibration state self._threshold_trials: list[float] | None = None + # Runtime sparsity measurement + self._measure_sparsity: bool = False + self._sparsity_total: int = 0 + self._sparsity_skipped: int = 0 @property def name(self) -> str: @@ -83,18 +90,28 @@ def _triton_inference_context(self, module): """Inference: activate skip-softmax with calibrated or fixed threshold.""" module._apply_skip_softmax = True - # When calibrated, pass scale_factor so backends compute - # threshold = scale_factor / seq_k at call time (adapts to actual seqlen). - # Otherwise fall back to the static threshold. 
- scale_factor = self._get_scale_factor() - if scale_factor is not None: - self._set_triton_backends(scale_factor=scale_factor) + backend_kwargs: dict = {} + if self._measure_sparsity: + backend_kwargs["measure_sparsity"] = True + + # Priority: raw_threshold > scale_factor (calibrated) > static threshold + if self.skip_softmax_raw_threshold is not None: + self._set_triton_backends( + raw_threshold=self.skip_softmax_raw_threshold, **backend_kwargs + ) else: - self._set_triton_backends(threshold=self.skip_softmax_threshold) + scale_factor = self._get_scale_factor() + if scale_factor is not None: + self._set_triton_backends(scale_factor=scale_factor, **backend_kwargs) + else: + self._set_triton_backends(threshold=self.skip_softmax_threshold, **backend_kwargs) with self._get_diffusers_backend_context(): try: yield finally: + # Collect accumulated runtime sparsity counters before clearing + if self._measure_sparsity: + self._collect_sparsity_counters() module._apply_skip_softmax = False self._clear_triton_backends() @@ -233,3 +250,31 @@ def get_threshold_info(self) -> dict: "type": "static", "value": self.skip_softmax_threshold, } + + # ------------------------------------------------------------------ + # Runtime sparsity measurement + # ------------------------------------------------------------------ + + def enable_measure_sparsity(self, enabled: bool = True) -> None: + """Enable or disable runtime sparsity measurement.""" + self._measure_sparsity = enabled + + def reset_sparsity_counters(self) -> None: + """Reset accumulated sparsity counters to zero.""" + self._sparsity_total = 0 + self._sparsity_skipped = 0 + + def get_sparsity_counters(self) -> tuple[int, int]: + """Return accumulated ``(total_tiles, skipped_tiles)``.""" + return self._sparsity_total, self._sparsity_skipped + + def _collect_sparsity_counters(self) -> None: + """Read runtime sparsity counters from the backend and accumulate.""" + try: + from ..kernels.diffusers_triton_attention import 
get_sparsity_counters + + total, skipped = get_sparsity_counters() + self._sparsity_total += total + self._sparsity_skipped += skipped + except ImportError: + pass From b86d3113150019e902acabb0da2f4afda0485965 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Wed, 8 Apr 2026 15:36:15 -0700 Subject: [PATCH 15/21] Remove the eager attention Signed-off-by: Jingyu Xin --- .../sparsity/attention_sparsity/conversion.py | 20 +-- .../attention_sparsity/kernels/__init__.py | 31 +--- .../kernels/diffusers_eager_attention.py | 147 ------------------ .../kernels/ltx_eager_attention.py | 114 -------------- 4 files changed, 7 insertions(+), 305 deletions(-) delete mode 100644 modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py delete mode 100644 modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py diff --git a/modelopt/torch/sparsity/attention_sparsity/conversion.py b/modelopt/torch/sparsity/attention_sparsity/conversion.py index 3a1d52fbd4..df84fc270f 100644 --- a/modelopt/torch/sparsity/attention_sparsity/conversion.py +++ b/modelopt/torch/sparsity/attention_sparsity/conversion.py @@ -116,39 +116,31 @@ def is_attn_sparsified(model: nn.Module) -> bool: def _register_diffusers_backends_if_needed(model: nn.Module) -> None: - """Register diffusers/LTX attention backends if the model needs them. + """Register diffusers/LTX Triton attention backends if the model needs them. Called before plugin registration so that the backends are available when ``SparseAttentionModule.forward()`` activates the skip-softmax context. 
""" - # Register the diffusers eager and Triton backends if the model is a diffusers ModelMixin + import contextlib + + # Register the diffusers Triton backend if the model is a diffusers ModelMixin try: from diffusers.models.modeling_utils import ModelMixin if isinstance(model, ModelMixin): - from .kernels import ( - register_diffusers_eager_attention, - register_diffusers_triton_attention, - ) + from .kernels import register_diffusers_triton_attention - if register_diffusers_eager_attention is not None: - register_diffusers_eager_attention() if register_diffusers_triton_attention is not None: register_diffusers_triton_attention() except (ImportError, Exception): pass # Patch ltx_core Attention modules if present (independent of diffusers) - import contextlib - try: - from .kernels import register_ltx_eager_attention, register_ltx_triton_attention + from .kernels import register_ltx_triton_attention except (ImportError, RuntimeError): return - if register_ltx_eager_attention is not None: - with contextlib.suppress(Exception): - register_ltx_eager_attention(model) if register_ltx_triton_attention is not None: with contextlib.suppress(Exception): register_ltx_triton_attention(model) diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py index 81f4295bb4..e8111cb0fb 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py @@ -13,61 +13,32 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-"""Kernel integrations for sparse attention: Triton FA and diffusers backends.""" +"""Kernel integrations for sparse attention: Triton FA and diffusers/LTX backends.""" import contextlib -import threading # --------------------------------------------------------------------------- # Triton FA kernel re-exports (for HuggingFace LLM integration) # --------------------------------------------------------------------------- from modelopt.torch.kernels import IS_AVAILABLE, attention, register_triton_attention -# --------------------------------------------------------------------------- -# Thread-local context: shared by diffusers eager and Triton backends -# --------------------------------------------------------------------------- -_thread_local = threading.local() - - -def set_skip_softmax_context(active: bool) -> None: - """Set thread-local flag indicating skip-softmax eager attention is active.""" - _thread_local.skip_softmax_active = active - - -def get_skip_softmax_context() -> bool: - """Return True if skip-softmax eager attention is active in this thread.""" - return getattr(_thread_local, "skip_softmax_active", False) - - # --------------------------------------------------------------------------- # Optional backend registrations (depend on diffusers / ltx_core) # --------------------------------------------------------------------------- -register_diffusers_eager_attention = None register_diffusers_triton_attention = None -register_ltx_eager_attention = None register_ltx_triton_attention = None # Suppress ImportError (missing package) and RuntimeError (triton without GPU driver) -with contextlib.suppress(ImportError, RuntimeError): - from .diffusers_eager_attention import register_diffusers_eager_attention - with contextlib.suppress(ImportError, RuntimeError): from .diffusers_triton_attention import register_diffusers_triton_attention -with contextlib.suppress(ImportError, RuntimeError): - from .ltx_eager_attention import register_ltx_eager_attention - 
with contextlib.suppress(ImportError, RuntimeError): from .ltx_triton_attention import register_ltx_triton_attention __all__ = [ "IS_AVAILABLE", "attention", - "get_skip_softmax_context", - "register_diffusers_eager_attention", "register_diffusers_triton_attention", - "register_ltx_eager_attention", "register_ltx_triton_attention", "register_triton_attention", - "set_skip_softmax_context", ] diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py deleted file mode 100644 index 16dd895f27..0000000000 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/diffusers_eager_attention.py +++ /dev/null @@ -1,147 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Eager attention backend for diffusers skip-softmax sparse attention. - -Registers a ``modelopt_skip_softmax`` backend in diffusers' -``_AttentionBackendRegistry`` that computes attention eagerly with an explicit -``F.softmax`` call. This allows the existing softmax-patching mechanism in -``SparseAttentionModule`` to intercept and apply block-wise sparsity. - -Used during **calibration only** — inference uses the Triton FA kernel. 
-""" - -import inspect -import math - -import torch -import torch.nn.functional as F -from diffusers.models.attention_dispatch import ( - AttentionBackendName, - _AttentionBackendRegistry, - attention_backend, -) - -_BACKEND_NAME = "modelopt_skip_softmax" -_BACKEND_REGISTERED = False - - -# --------------------------------------------------------------------------- -# Eager attention implementation -# --------------------------------------------------------------------------- - - -def _diffusers_eager_attention( - query: torch.Tensor, - key: torch.Tensor, - value: torch.Tensor, - attn_mask: torch.Tensor | None = None, - dropout_p: float = 0.0, - is_causal: bool = False, - scale: float | None = None, - enable_gqa: bool = False, -) -> torch.Tensor: - """Compute attention eagerly on diffusers layout ``[B, S, H, D]``. - - The explicit ``F.softmax`` call is what the skip-softmax patch intercepts. - """ - # Diffusers convention: [B, S, H, D] → transpose to [B, H, S, D] - query = query.transpose(1, 2) - key = key.transpose(1, 2) - value = value.transpose(1, 2) - - # Handle GQA: repeat K/V heads to match Q heads - if enable_gqa and query.shape[1] != key.shape[1]: - num_heads_q = query.shape[1] - num_heads_kv = key.shape[1] - n_rep = num_heads_q // num_heads_kv - key = key.repeat_interleave(n_rep, dim=1) - value = value.repeat_interleave(n_rep, dim=1) - - if scale is None: - scale = 1.0 / math.sqrt(query.shape[-1]) - - # Q @ K^T * scale - scores = torch.matmul(query, key.transpose(-2, -1)) * scale - - # Apply attention mask if provided - if attn_mask is not None: - scores = scores + attn_mask - - # Apply causal mask if needed - if is_causal: - seq_q, seq_k = scores.shape[-2], scores.shape[-1] - causal_mask = torch.triu( - torch.full((seq_q, seq_k), float("-inf"), device=scores.device, dtype=scores.dtype), - diagonal=seq_k - seq_q + 1, - ) - scores = scores + causal_mask - - # F.softmax — this is where the skip-softmax patch intercepts - scores = F.softmax(scores, dim=-1) - 
- if dropout_p > 0.0: - scores = F.dropout(scores, p=dropout_p, training=True) - - # scores @ V - out = torch.matmul(scores, value) - - # Transpose back: [B, H, S, D] → [B, S, H, D] - out = out.transpose(1, 2) - return out - - -# --------------------------------------------------------------------------- -# Registration -# --------------------------------------------------------------------------- - - -def register_diffusers_eager_attention() -> None: - """Register ``modelopt_skip_softmax`` backend in diffusers. - - Safe to call multiple times; registration happens only once. - """ - global _BACKEND_REGISTERED - if _BACKEND_REGISTERED: - return - - # Extend the AttentionBackendName enum with our custom value - new_member = str.__new__(AttentionBackendName, _BACKEND_NAME) - new_member._name_ = "MODELOPT_SKIP_SOFTMAX" - new_member._value_ = _BACKEND_NAME - AttentionBackendName._member_map_["MODELOPT_SKIP_SOFTMAX"] = new_member - AttentionBackendName._value2member_map_[_BACKEND_NAME] = new_member - - # Register the backend function - _AttentionBackendRegistry._backends[new_member] = _diffusers_eager_attention - _AttentionBackendRegistry._constraints[new_member] = [] - _AttentionBackendRegistry._supported_arg_names[new_member] = set( - inspect.signature(_diffusers_eager_attention).parameters.keys() - ) - - _BACKEND_REGISTERED = True - - -def get_skip_softmax_attention_backend(): - """Return a context manager that activates the modelopt_skip_softmax backend. - - Raises RuntimeError if the backend has not been registered yet. - """ - if not _BACKEND_REGISTERED: - raise RuntimeError( - "modelopt_skip_softmax backend not registered. " - "Call register_diffusers_eager_attention() first." 
- ) - return attention_backend(_BACKEND_NAME) diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py b/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py deleted file mode 100644 index 6c082ee588..0000000000 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/ltx_eager_attention.py +++ /dev/null @@ -1,114 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Eager attention wrapper for LTX-2 (ltx_core) skip-softmax sparse attention. - -Patches ``Attention`` modules from ``ltx_core`` so that when the skip-softmax -thread-local flag is active, attention is computed eagerly with an explicit -``F.softmax`` call that the softmax-patching mechanism can intercept. - -Used during **calibration only** — inference uses the Triton FA kernel via -the diffusers Triton backend. -""" - -import math - -import torch -import torch.nn.functional as F -from ltx_core.model.transformer.attention import Attention - -from . import get_skip_softmax_context - - -def _ltx_eager_attention( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - heads: int, - mask: torch.Tensor | None = None, -) -> torch.Tensor: - """Eager attention on LTX-2 layout ``[B, T, H*D]``. 
- - Mirrors the ``PytorchAttention`` class in ltx_core but uses an explicit - ``F.softmax`` instead of ``scaled_dot_product_attention``. - """ - b, _, dim_total = q.shape - dim_head = dim_total // heads - - # Reshape to [B, T, H, D] then transpose to [B, H, T, D] - q = q.view(b, -1, heads, dim_head).transpose(1, 2) - k = k.view(b, -1, heads, dim_head).transpose(1, 2) - v = v.view(b, -1, heads, dim_head).transpose(1, 2) - - scale = 1.0 / math.sqrt(dim_head) - - # Q @ K^T * scale - scores = torch.matmul(q, k.transpose(-2, -1)) * scale - - # Apply mask if provided - if mask is not None: - # Expand mask dimensions to match scores [B, H, Sq, Sk] - if mask.ndim == 2: - mask = mask.unsqueeze(0) - if mask.ndim == 3: - mask = mask.unsqueeze(1) - scores = scores + mask - - # F.softmax — intercepted by skip-softmax patch - scores = F.softmax(scores, dim=-1) - - # scores @ V - out = torch.matmul(scores, v) - - # [B, H, T, D] → [B, T, H*D] - out = out.transpose(1, 2).reshape(b, -1, heads * dim_head) - return out - - -class _SkipSoftmaxLTXAttentionWrapper: - """Wraps an ``attention_function`` callable from ltx_core. - - When the thread-local skip-softmax flag is active, routes to the eager - attention path. Otherwise calls the original function. - """ - - def __init__(self, original_fn): - self._original_fn = original_fn - - def __call__( - self, - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - heads: int, - mask: torch.Tensor | None = None, - ) -> torch.Tensor: - if get_skip_softmax_context(): - return _ltx_eager_attention(q, k, v, heads, mask) - return self._original_fn(q, k, v, heads, mask) - - -def register_ltx_eager_attention(model: torch.nn.Module) -> None: - """Walk *model* and patch all ``ltx_core.model.transformer.attention.Attention`` modules. - - Patches modules so their ``attention_function`` is routed through the eager wrapper. - Safe to call multiple times on the same model — already-wrapped modules are - skipped. 
- """ - for module in model.modules(): - if isinstance(module, Attention): - fn = module.attention_function - if not isinstance(fn, _SkipSoftmaxLTXAttentionWrapper): - module.attention_function = _SkipSoftmaxLTXAttentionWrapper(fn) From 45bcad65ac41bf9c41c801e02eeb621bef369213 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Wed, 8 Apr 2026 20:39:48 -0700 Subject: [PATCH 16/21] Update the calibration, fixed some bugs Signed-off-by: Jingyu Xin --- examples/diffusers/sparsity/README.md | 202 ++++++++++-------- .../diffusers/sparsity/wan22_skip_softmax.py | 4 + modelopt/torch/kernels/triton_fa.py | 10 +- .../calibration/calibrate.py | 13 +- .../calibration/calibrator.py | 94 +++++--- .../sparsity/attention_sparsity/config.py | 9 + .../methods/triton_skip_softmax.py | 17 ++ 7 files changed, 222 insertions(+), 127 deletions(-) diff --git a/examples/diffusers/sparsity/README.md b/examples/diffusers/sparsity/README.md index e4aea18f0d..71df2e1c5b 100644 --- a/examples/diffusers/sparsity/README.md +++ b/examples/diffusers/sparsity/README.md @@ -1,123 +1,139 @@ # Skip-Softmax Sparse Attention for Diffusion Models -Skip-softmax sparse attention (BLASST) skips KV tiles whose attention scores -are negligible during the FlashAttention computation, reducing FLOPs without -retraining. An exponential model (`scale_factor = a * exp(b * target_sparsity)`) -is calibrated once, then the target sparsity can be adjusted at runtime without -recalibration. +Skip-softmax sparse attention (BLASST, https://arxiv.org/pdf/2512.12087) skips KV +tiles whose attention scores are negligible during the FlashAttention computation, +reducing FLOPs without retraining. -## Changes from Main Branch +Two modes are supported: +- **Fixed raw threshold** — pass a log2-space threshold directly to the Triton + kernel. No calibration needed. Good for quick testing and sweeps. 
+- **Calibrated threshold** — an exponential model + (`scale_factor = a * exp(b * target_sparsity)`) is calibrated once via the + Triton calibration kernel, then the target sparsity can be adjusted at runtime + without recalibration. -### Core Triton Kernel (`modelopt/torch/kernels/`) +## Supported Models -| File | Change | -|------|--------| -| `triton_fa.py` | Added `_attn_fwd_calibrate` kernel: computes full attention while measuring skip decisions for multiple thresholds via atomic counters. Added `attention_calibrate()` Python API. | -| `__init__.py` | Export `attention_calibrate` alongside `attention`. | +| Model | Script | Notes | +|-------|--------|-------| +| WAN 2.2 5B | `wan22_skip_softmax.py` | Single transformer, self-attention only | +| WAN 2.2 14B | `wan22_skip_softmax.py` | Dual transformer (auto-detected) | +| LTX-2 | (coming soon) | Via `ltx_triton_attention.py` backend | -The kernel has two modes: -- **Inference** (`_attn_fwd`): Autotuned, single threshold, actual tile skipping. -- **Calibration** (`_attn_fwd_calibrate`): Fixed block sizes (128×64), multi-threshold measurement, no skipping (full attention output). - -### Sparse Attention Methods (`modelopt/torch/sparsity/attention_sparsity/methods/`) - -| File | Change | -|------|--------| -| `triton_skip_softmax.py` | Extended with calibration support: `_triton_calibration_context()` sets Triton calibration mode and collects counters; `_triton_inference_context()` activates diffusers backend with calibrated threshold; `_get_diffusers_backend_context()` activates `modelopt_triton` attention backend. | -| `flash_skip_softmax.py` | Enhanced `get_sparse_context()` with `ExitStack` to also activate diffusers eager backend for calibration. | -| `registry.py` | Added `set_calibration_mode()` to base `SparseAttentionMethod` class. | -| `__init__.py` | Updated imports. 
| - -### Kernel Backends (`modelopt/torch/sparsity/attention_sparsity/kernels/`) - -| File | Change | -|------|--------| -| `__init__.py` | Added thread-local context (`set_skip_softmax_context` / `get_skip_softmax_context`), lazy imports for diffusers/LTX backends with `contextlib.suppress(ImportError, RuntimeError)`. | -| `diffusers_triton_attention.py` | **New.** Registers `modelopt_triton` backend in diffusers. Two modes: inference calls `attention()`, calibration calls `attention_calibrate()`. Accumulates counters across attention calls. | -| `diffusers_eager_attention.py` | **New.** Registers `modelopt_skip_softmax` eager backend for LLM calibration (explicit `F.softmax` for patching). | -| `ltx_triton_attention.py` | **New.** Patches `ltx_core.Attention` modules for Triton dispatch. Supports calibration and inference modes. | -| `ltx_eager_attention.py` | **New.** Patches `ltx_core.Attention` for eager attention calibration. | - -### Calibration (`modelopt/torch/sparsity/attention_sparsity/calibration/`) - -| File | Change | -|------|--------| -| `calibrate.py` | Skip RULER dataset generation when user provides `forward_loop` (required for diffusion models). Guard `from transformers import AutoTokenizer` as lazy import. | -| `calibrator.py` | `_set_thresholds()` detects method type — sets `_threshold_trials` for `triton_skip_softmax`, `thresholds` for `flash_skip_softmax`. | - -### Conversion & Config - -| File | Change | -|------|--------| -| `conversion.py` | Added `_register_diffusers_backends_if_needed()` — auto-registers diffusers/LTX backends on `sparsify()`. Updated export config and summary display. | -| `config.py` | Added `skip_softmax_threshold` field to `SparseAttentionAttributeConfig`. | -| `plugins/huggingface.py` | Added diffusers `ModelMixin` support in `_is_supported_model()`. Lazy `import transformers`. | -| `stats_manager.py` | Made `sparse_blocks` optional in `collect()`. Preserve `normalized_gaps` in calibration stats. 
| -| `sparse_attention.py` | (Changes from main for VSA support also present.) | - -### Example Scripts - -| File | Description | -|------|-------------| -| `wan22_skip_softmax.py` | **New.** Wan 2.2 text-to-video with skip-softmax. Supports 5B (single transformer) and 14B (dual transformer). Uses `triton_skip_softmax` with Triton calibration kernel. Calibration prompts from OpenVid-1M. | - -### Tests - -| File | Description | -|------|-------------| -| `test_kernel_backends.py` | **New.** Unit tests for diffusers kernel backends with mocked dependencies (no GPU required). | - -## Usage +## Quick Start ```bash -# Wan 2.2 5B — calibrate + generate +# Fixed raw threshold (no calibration, fast) python wan22_skip_softmax.py \ - --model-path Wan-AI/Wan2.2-TI2V-5B-Diffusers \ - --calibrate --target-sparsity 0.5 --calib-size 4 \ - --calib-frames 151 --calib-steps 40 \ - --prompt "A cat sitting on a windowsill" --output out.mp4 + --model-path /path/to/Wan2.2-T2V-A14B-Diffusers \ + --raw-threshold -0.7 \ + --prompt "A cat playing piano" --output out.mp4 -# Wan 2.2 14B — both transformers sparsified +# With calibration python wan22_skip_softmax.py \ - --model-path Wan-AI/Wan2.2-T2V-A14B-Diffusers \ - --calibrate --target-sparsity 0.5 --calib-size 4 \ - --calib-frames 151 --calib-steps 40 \ - --prompt "A sunset over mountains" --output out.mp4 + --model-path /path/to/Wan2.2-T2V-A14B-Diffusers \ + --calibrate --target-sparsity 0.5 \ + --prompt "A cat playing piano" --output out.mp4 -# Calibrate only (no video generation) +# Dense baseline (no sparsity, for comparison) python wan22_skip_softmax.py \ - --model-path Wan-AI/Wan2.2-TI2V-5B-Diffusers \ - --calibrate --target-sparsity 0.5 --calib-size 4 + --model-path /path/to/Wan2.2-T2V-A14B-Diffusers \ + --baseline \ + --prompt "A cat playing piano" --output baseline.mp4 + +# Report runtime sparsity (per-layer tile skip ratios) +python wan22_skip_softmax.py \ + --raw-threshold -0.7 --report-avg-sparsity \ + --prompt "A cat playing 
piano" --output out.mp4 ``` ## Architecture +### Inference Path (Triton kernel with tile skipping) + +```text +SparseAttentionModule.forward() + └─ triton_skip_softmax._triton_inference_context() + ├─ Priority: raw_threshold > scale_factor (calibrated) > static threshold + ├─ _set_triton_backends(raw_threshold=X) or (scale_factor=X) + ├─ attention_backend("modelopt_triton") + └─ _diffusers_triton_attention() → attention() + └─ _attn_fwd kernel: skip tiles where tile_row_max < row_max + threshold +``` + +### Calibration Path (Triton calibration kernel) + ```text mtsa.sparsify(transformer, config, forward_loop) - │ ├─ apply_mode() → replace attention with SparseAttentionModule - │ └─ calibrate() - │ ├─ DynamicThresholdCalibrator._set_thresholds() - │ └─ sets method._threshold_trials = [1e-6, ..., 9.9e-1] - │ + │ └─ method._threshold_trials = [1e-6, ..., 9.9e-1] ├─ forward_loop(model) - │ │ │ └─ SparseAttentionModule.forward() - │ │ │ └─ triton_skip_softmax._triton_calibration_context() │ ├─ set_triton_skip_softmax_config(calibration_mode=True) │ ├─ attention_backend("modelopt_triton") - │ ├─ _diffusers_triton_attention() → attention_calibrate() - │ │ └─ _attn_fwd_calibrate kernel (full attn + atomic counters) - │ └─ _collect_calibration_stats() → module._last_stats - │ + │ └─ _diffusers_triton_attention() → attention_calibrate() + │ └─ _attn_fwd_calibrate kernel: + │ - Full attention (no skipping) for correct output + │ - Vectorized multi-threshold sparsity measurement + │ - Per-program output buffers (no atomic contention) + │ - Python-side reduction: sum across programs ├─ Fit: scale_factor = a * exp(b * sparsity) - │ └─ Apply a, b to all modules - │ - └─ Inference: triton_skip_softmax._triton_inference_context() - ├─ threshold = a * exp(b * target) / seqlen - └─ attention() with skip_softmax_threshold → actual tile skipping + └─ Inference: threshold = scale_factor / seq_k ``` + +## Core Files + +### Triton Kernels (`modelopt/torch/kernels/`) + +| File | Role | 
+|------|------| +| `triton_fa.py` | `_attn_fwd`: forward kernel with optional tile skipping + sparsity measurement. `_attn_fwd_calibrate`: calibration kernel with vectorized multi-threshold testing and per-program buffers (zero atomic contention). `attention()` and `attention_calibrate()` Python APIs. | + +### Sparse Attention Methods (`modelopt/torch/sparsity/attention_sparsity/methods/`) + +| File | Role | +|------|------| +| `triton_skip_softmax.py` | Primary method for diffusion models. Calibration context → Triton calibration kernel. Inference context → Triton forward kernel. Supports `scale_factor` (calibrated), `raw_threshold` (direct), and static `skip_softmax_threshold`. | +| `flash_skip_softmax.py` | PyTorch-based method for HF LLMs (not used by diffusers/LTX). | +| `registry.py` | Base class `SparseAttentionMethod` with `calibration_params`, `target_sparse_ratio`, `set_calibration_mode()`. | + +### Kernel Backends (`modelopt/torch/sparsity/attention_sparsity/kernels/`) + +| File | Role | +|------|------| +| `diffusers_triton_attention.py` | Registers `modelopt_triton` backend in diffusers. Handles calibration mode (→ `attention_calibrate`) and inference mode (→ `attention` with `scale_factor/seq_k` or `raw_threshold`). Runtime sparsity counter accumulation. | +| `ltx_triton_attention.py` | Patches `ltx_core.Attention` modules for Triton dispatch. Same calibration/inference modes. | +| `hf_triton_attention.py` | HuggingFace `attn_implementation="modelopt_triton"` backend for LLMs. | + +### Calibration (`modelopt/torch/sparsity/attention_sparsity/calibration/`) + +| File | Role | +|------|------| +| `calibrate.py` | Orchestrates calibration. Skips RULER dataset when user provides `forward_loop` (diffusion models). Applies fitted (a, b) to all modules. | +| `calibrator.py` | `DynamicThresholdCalibrator`: collects (scale_factor, sparsity) pairs via Triton calibration kernel, fits exponential model `scale_factor = a * exp(b * sparsity)`. 
| + +### Config & Conversion + +| File | Role | +|------|------| +| `config.py` | `SparseAttentionAttributeConfig` with `skip_softmax_threshold`, `skip_softmax_raw_threshold`, calibration settings. | +| `conversion.py` | `_register_diffusers_backends_if_needed()` auto-registers Triton backends on `sparsify()`. | +| `sparse_attention.py` | `SparseAttentionModule` wrapper — delegates to method's `get_sparse_context()`. | + +## Threshold Modes + +| Mode | How threshold reaches the kernel | Use case | +|------|----------------------------------|----------| +| **Raw threshold** (`--raw-threshold -0.7`) | Passed directly as `skip_threshold_log2` — no conversion | Quick testing, sweeps | +| **Calibrated** (`--calibrate --target-sparsity 0.5`) | `scale_factor = a * exp(b * target)`, then backend computes `threshold = scale_factor / seq_k`, then kernel converts `log2(threshold) * sm_scale` | Production use with automatic seqlen adaptation | +| **Static lambda** (default `skip_softmax_threshold=0.1`) | `log2(lambda) * sm_scale` | Fallback when neither raw nor calibrated | + +## Known Issues + +- **Calibration sparsity ratio**: The calibrated threshold goes through `log2(threshold) * sm_scale` conversion, producing `skip_threshold_log2` values in a different scale than raw thresholds. Needs investigation to ensure the fitted (a, b) parameters produce expected sparsity levels. +- **14B dual transformer calibration**: Transformers are calibrated sequentially — transformer_2's calibration runs while transformer_1 is already sparsified, introducing asymmetric calibration conditions. 
+ diff --git a/examples/diffusers/sparsity/wan22_skip_softmax.py b/examples/diffusers/sparsity/wan22_skip_softmax.py index 95f42cb3ef..9824ce7251 100644 --- a/examples/diffusers/sparsity/wan22_skip_softmax.py +++ b/examples/diffusers/sparsity/wan22_skip_softmax.py @@ -73,6 +73,9 @@ # Default threshold trials for calibration DEFAULT_THRESHOLD_TRIALS = [ + 1e-12, + 1e-10, + 1e-8, 1e-6, 5e-6, 1e-5, @@ -247,6 +250,7 @@ def build_sparse_config(args: argparse.Namespace, num_blocks: int) -> dict: "target_sparse_ratio": {"prefill": args.target_sparsity}, "samples": 1, "threshold_trials": DEFAULT_THRESHOLD_TRIALS, + "fit_logspace": True, } return config diff --git a/modelopt/torch/kernels/triton_fa.py b/modelopt/torch/kernels/triton_fa.py index 9ea5586705..8044383889 100644 --- a/modelopt/torch/kernels/triton_fa.py +++ b/modelopt/torch/kernels/triton_fa.py @@ -1213,11 +1213,11 @@ def _attn_fwd_calibrate( tile_row_max = tl.max(scores, 1) # --- Vectorized multi-threshold sparsity measurement --- - # Compute the "hardest to skip" gap across all Q rows in this tile. - # A tile is skipped iff ALL rows satisfy: tile_row_max < row_max + thresh. - # Equivalently: min(tile_row_max - row_max) < thresh. - min_gap = tl.min(tile_row_max - row_max) # scalar - skip_mask = (min_gap < thresholds).to(tl.int32) # [PADDED_THRESHOLDS] + # A tile is skipped iff ALL Q rows satisfy: tile_row_max < row_max + thresh. + # Equivalently: max(tile_row_max - row_max) < thresh (worst-case row + # must still be below threshold for the tile to be skippable). 
+ max_gap = tl.max(tile_row_max - row_max) # scalar + skip_mask = (max_gap < thresholds).to(tl.int32) # [PADDED_THRESHOLDS] local_skipped += skip_mask num_tiles += 1 diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py index 3215a7530b..f63feac69e 100644 --- a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py +++ b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrate.py @@ -296,6 +296,7 @@ def calibrate_sparse_attention( prefill_calibrator = DynamicThresholdCalibrator( threshold_trials=calib_config.threshold_trials, + fit_logspace=calib_config.fit_logspace, ) prefill_result = prefill_calibrator.calibrate(model, prefill_forward_loop, phase="prefill") @@ -318,6 +319,7 @@ def calibrate_sparse_attention( decode_calibrator = DynamicThresholdCalibrator( threshold_trials=calib_config.threshold_trials, + fit_logspace=calib_config.fit_logspace, ) decode_result = decode_calibrator.calibrate(model, decode_forward_loop, phase="decode") @@ -331,15 +333,20 @@ def calibrate_sparse_attention( warnings.warn("No calibration produced valid results") return {} - # Extract a and b for each phase + # Extract a, b, and observed sparsity range for each phase calibration_params: dict[str, dict[str, float]] = {} for phase in ["prefill", "decode"]: if phase in calibration_results: result = calibration_results[phase] - calibration_params[phase] = { + params: dict[str, float] = { "a": result["a"], "b": result["b"], } + if "min_observed_sparsity" in result: + params["min_observed_sparsity"] = result["min_observed_sparsity"] + if "max_observed_sparsity" in result: + params["max_observed_sparsity"] = result["max_observed_sparsity"] + calibration_params[phase] = params # Apply calibration params to all modules print("\n" + "=" * 60) @@ -349,7 +356,7 @@ def calibrate_sparse_attention( for phase, params in calibration_params.items(): result = calibration_results[phase] 
print(f" {phase}:") - print(f" Model: scale_factor = {params['a']:.6f} * exp({params['b']:.4f} * sparsity)") + print(f" Model: scale_factor = {params['a']:.6e} * exp({params['b']:.4f} * sparsity)") print(f" R-squared: {result['r_squared']:.6f}") for module_name, module in sparse_modules: diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py index f2bb8831d7..34d8987175 100644 --- a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py +++ b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py @@ -55,12 +55,16 @@ class DynamicThresholdCalibrator: def __init__( self, threshold_trials: list[float] | None = None, + fit_logspace: bool = False, ): """Initialize dynamic threshold calibrator. Args: threshold_trials: List of thresholds to try during calibration. Should span a range that achieves sparsities from ~10% to ~95%. + fit_logspace: If True, fit the exponential model in log space + (minimizes relative error). Recommended for diffusion models + where scale_factors span many orders of magnitude. """ # Default threshold trials if not provided self.threshold_trials = threshold_trials or [ @@ -85,6 +89,7 @@ def __init__( 9.5e-1, 9.9e-1, ] + self.fit_logspace = fit_logspace def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dict[str, Any]: """Calibrate a and b parameters for Exponential model. 
@@ -167,6 +172,8 @@ def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dic # Filter out extreme sparsities (must be in (10%, 90%)) # Extreme values are unreliable for fitting valid_mask = (sparsities >= 0.10) & (sparsities <= 0.90) + if self.fit_logspace: + valid_mask &= scale_factors > 0 # log requires positive values scale_factors = scale_factors[valid_mask] sparsities = sparsities[valid_mask] @@ -176,47 +183,81 @@ def calibrate(self, model: nn.Module, forward_loop: Callable, phase: str) -> dic ) return {} - # Define Exponential model: sf = a * exp(b * S) - def exponential(sparsity, a, b): - return a * np.exp(b * sparsity) + # Record observed sparsity range for feasibility checks at inference + min_observed_sparsity = float(np.min(sparsities)) + max_observed_sparsity = float(np.max(sparsities)) - # Fit the model try: - popt, pcov = curve_fit( - exponential, - sparsities, - scale_factors, - p0=[1.0, 5.0], # Initial guess - bounds=([0.0, 0.0], [np.inf, 20.0]), # Bounds for a and b - maxfev=10000, - ) - a, b = popt + if self.fit_logspace: + # Log-space fit: minimizes relative error. Recommended for + # diffusion models where scale_factors span many orders of + # magnitude (e.g. 0.06 to 57,000) — a linear-space fit would + # be dominated by the largest values. + log_scale_factors = np.log(scale_factors) + + def log_exponential(sparsity, log_a, b): + return log_a + b * sparsity + + popt, pcov = curve_fit( + log_exponential, + sparsities, + log_scale_factors, + p0=[0.0, 10.0], + maxfev=10000, + ) + log_a, b = popt + a = np.exp(log_a) + + # R-squared in log space (where the fit was performed) + pred = log_exponential(sparsities, log_a, b) + ss_res = np.sum((log_scale_factors - pred) ** 2) + ss_tot = np.sum((log_scale_factors - np.mean(log_scale_factors)) ** 2) + else: + # Linear-space fit (default): minimizes absolute error. 
+ + def exponential(sparsity, a, b): + return a * np.exp(b * sparsity) + + popt, pcov = curve_fit( + exponential, + sparsities, + scale_factors, + p0=[1.0, 5.0], + bounds=([0.0, 0.0], [np.inf, 20.0]), + maxfev=10000, + ) + a, b = popt + + pred = exponential(sparsities, a, b) + ss_res = np.sum((scale_factors - pred) ** 2) + ss_tot = np.sum((scale_factors - np.mean(scale_factors)) ** 2) except Exception as e: warnings.warn(f"Curve fitting failed: {e}") return {} - # Calculate R-squared and RMSE - pred_scale_factors = exponential(sparsities, a, b) - ss_res = np.sum((scale_factors - pred_scale_factors) ** 2) - ss_tot = np.sum((scale_factors - np.mean(scale_factors)) ** 2) r_squared = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0 - rmse = np.sqrt(np.mean((scale_factors - pred_scale_factors) ** 2)) - print(f"\n{phase.capitalize()} Calibration Results (Exponential Model):") + fit_label = "log-space" if self.fit_logspace else "linear-space" + print(f"\n{phase.capitalize()} Calibration Results (Exponential Model, {fit_label} fit):") print(" Model: scale_factor = a * exp(b * sparsity)") - print(f" Fitted a: {a:.6f}") + print(f" Fitted a: {a:.6e}") print(f" Fitted b: {b:.4f}") print(f" R-squared: {r_squared:.6f}") - print(f" RMSE: {rmse:.2f}") + print(f" Observed sparsity range: [{min_observed_sparsity:.1%}, {max_observed_sparsity:.1%}]") print(f" Data points used: {int(np.sum(valid_mask))} / {len(all_data_points)}") # Show scale_factor for various target sparsities print("\nScale factors for different target sparsities:") - print(f" {'Target':<10} {'Scale Factor':<15}") - print(f" {'-' * 10} {'-' * 15}") - for target in [0.5, 0.7, 0.8, 0.9, 0.95]: + print(f" {'Target':<10} {'Scale Factor':<15} {'Note':<20}") + print(f" {'-' * 10} {'-' * 15} {'-' * 20}") + for target in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]: sf = a * np.exp(b * target) - print(f" {target:<10.0%} {sf:<15.2f}") + note = "" + if target < min_observed_sparsity: + note = "(extrapolation)" + elif target > 
max_observed_sparsity: + note = "(extrapolation)" + print(f" {target:<10.0%} {sf:<15.4f} {note:<20}") # Print calibration data summary by threshold print("\nCalibration data summary (per threshold):") @@ -239,10 +280,11 @@ def exponential(sparsity, a, b): "a": float(a), "b": float(b), "r_squared": float(r_squared), - "rmse": float(rmse), "num_data_points": int(np.sum(valid_mask)), "total_samples": len(all_data_points), "calibration_type": "exponential", + "min_observed_sparsity": min_observed_sparsity, + "max_observed_sparsity": max_observed_sparsity, } def _enable_calibration_mode(self, modules: list[nn.Module]): diff --git a/modelopt/torch/sparsity/attention_sparsity/config.py b/modelopt/torch/sparsity/attention_sparsity/config.py index 42c8b0be8e..eed50b87af 100644 --- a/modelopt/torch/sparsity/attention_sparsity/config.py +++ b/modelopt/torch/sparsity/attention_sparsity/config.py @@ -337,6 +337,15 @@ class CalibrationConfig(ModeloptBaseConfig): ), ) + fit_logspace: bool = ModeloptField( + default=False, + title="Fit in log space", + description=( + "If True, fit the exponential model in log space (minimizes relative error). " + "Recommended for diffusion models where scale_factors span many orders of magnitude." 
+ ), + ) + cache_dir: str | None = ModeloptField( default=None, title="Cache directory", diff --git a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py index ec00eed0c2..1e2f3905e7 100644 --- a/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py +++ b/modelopt/torch/sparsity/attention_sparsity/methods/triton_skip_softmax.py @@ -137,12 +137,29 @@ def _get_scale_factor(self) -> float | None: """ if self.calibration_params and self.target_sparse_ratio: import math + import warnings params = self.calibration_params.get("prefill", {}) a = params.get("a", 0) b = params.get("b", 0) target = self.target_sparse_ratio.get("prefill", 0.5) if a > 0 and b > 0: + # Warn if target is outside the calibrated range + min_s = params.get("min_observed_sparsity") + max_s = params.get("max_observed_sparsity") + if min_s is not None and target < min_s: + warnings.warn( + f"Target sparsity {target:.1%} is below the minimum observed " + f"during calibration ({min_s:.1%}). The model is extrapolating " + f"and runtime sparsity will likely be higher than the target.", + stacklevel=2, + ) + elif max_s is not None and target > max_s: + warnings.warn( + f"Target sparsity {target:.1%} is above the maximum observed " + f"during calibration ({max_s:.1%}). 
The model is extrapolating.", + stacklevel=2, + ) return a * math.exp(b * target) return None From 22c5b850b13f4a28eed46340c80b039041e9bd0a Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Thu, 9 Apr 2026 04:03:18 +0000 Subject: [PATCH 17/21] Add the test case Signed-off-by: Jingyu Xin --- .../attention_sparsity/kernels/__init__.py | 22 +++- tests/_test_utils/torch/diffusers_models.py | 93 ++++++++++++++++ tests/examples/diffusers/test_sparsity.py | 104 ++++++++++++++++++ 3 files changed, 216 insertions(+), 3 deletions(-) create mode 100644 tests/examples/diffusers/test_sparsity.py diff --git a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py index e8111cb0fb..0cc4a202f5 100644 --- a/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py +++ b/modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py @@ -16,10 +16,8 @@ """Kernel integrations for sparse attention: Triton FA and diffusers/LTX backends.""" import contextlib +import threading -# --------------------------------------------------------------------------- -# Triton FA kernel re-exports (for HuggingFace LLM integration) -# --------------------------------------------------------------------------- from modelopt.torch.kernels import IS_AVAILABLE, attention, register_triton_attention # --------------------------------------------------------------------------- @@ -35,10 +33,28 @@ with contextlib.suppress(ImportError, RuntimeError): from .ltx_triton_attention import register_ltx_triton_attention +# --------------------------------------------------------------------------- +# Thread-local flag for flash_skip_softmax's eager-attention context +# --------------------------------------------------------------------------- +_thread_local = threading.local() + + +def set_skip_softmax_context(active: bool) -> None: + """Set whether skip-softmax softmax patching is active (thread-local).""" + _thread_local.skip_softmax_active 
= active + + +def get_skip_softmax_context() -> bool: + """Return whether skip-softmax softmax patching is active.""" + return getattr(_thread_local, "skip_softmax_active", False) + + __all__ = [ "IS_AVAILABLE", "attention", + "get_skip_softmax_context", "register_diffusers_triton_attention", "register_ltx_triton_attention", "register_triton_attention", + "set_skip_softmax_context", ] diff --git a/tests/_test_utils/torch/diffusers_models.py b/tests/_test_utils/torch/diffusers_models.py index c2f9d9b3d7..352cc60d79 100644 --- a/tests/_test_utils/torch/diffusers_models.py +++ b/tests/_test_utils/torch/diffusers_models.py @@ -32,6 +32,16 @@ except Exception: # pragma: no cover - optional diffusers models Flux2Transformer2DModel = None +try: + from diffusers.models.transformers import WanTransformer3DModel +except Exception: # pragma: no cover - optional diffusers models + WanTransformer3DModel = None + +try: + from diffusers.models.autoencoders import AutoencoderKLWan +except Exception: # pragma: no cover - optional diffusers models + AutoencoderKLWan = None + import modelopt.torch.opt as mto @@ -157,3 +167,86 @@ def df_modelopt_state_and_output_tester(model_ref, model_test): assert model_ref_state == model_test_state df_output_tester(model_ref, model_test) + + +def get_tiny_wan22_transformer(**config_kwargs): + """Create a tiny WanTransformer3DModel for testing.""" + if WanTransformer3DModel is None: + pytest.skip("WanTransformer3DModel is not available in this diffusers version.") + + kwargs = { + "patch_size": (1, 2, 2), + "num_attention_heads": 2, + "attention_head_dim": 12, + "in_channels": 16, + "out_channels": 16, + "text_dim": 32, + "freq_dim": 256, + "ffn_dim": 32, + "num_layers": 2, + "cross_attn_norm": True, + "qk_norm": "rms_norm_across_heads", + "rope_max_seq_len": 32, + } + kwargs.update(**config_kwargs) + return WanTransformer3DModel(**kwargs) + + +def get_tiny_wan22_vae(**config_kwargs): + """Create a tiny AutoencoderKLWan for testing.""" + if 
AutoencoderKLWan is None: + pytest.skip("AutoencoderKLWan is not available in this diffusers version.") + + kwargs = { + "base_dim": 3, + "z_dim": 16, + "dim_mult": [1, 1, 1, 1], + "num_res_blocks": 1, + "temperal_downsample": [False, True, True], + } + kwargs.update(**config_kwargs) + return AutoencoderKLWan(**kwargs) + + +def create_tiny_wan22_pipeline_dir(tmp_path: Path) -> Path: + """Create and save a tiny Wan 2.2 (14B-style) pipeline to a directory. + + Uses the same tiny config as diffusers' own Wan 2.2 tests: + - Transformer: 2 heads, 12 head_dim, 2 layers (hidden_dim=24) + - VAE: base_dim=3, z_dim=16 + - Text encoder: hf-internal-testing/tiny-random-t5 (hidden_size=32) + - Dual transformer (14B style) with boundary_ratio=0.875 + + The saved directory can be loaded with ``WanPipeline.from_pretrained(path)``. + """ + from diffusers import UniPCMultistepScheduler, WanPipeline + from transformers import AutoTokenizer, T5EncoderModel + + torch.manual_seed(0) + vae = get_tiny_wan22_vae() + + torch.manual_seed(0) + transformer = get_tiny_wan22_transformer() + + torch.manual_seed(0) + transformer_2 = get_tiny_wan22_transformer() + + scheduler = UniPCMultistepScheduler( + prediction_type="flow_prediction", use_flow_sigmas=True, flow_shift=3.0 + ) + text_encoder = T5EncoderModel.from_pretrained("hf-internal-testing/tiny-random-t5") + tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/tiny-random-t5") + + pipe = WanPipeline( + transformer=transformer, + transformer_2=transformer_2, + vae=vae, + scheduler=scheduler, + text_encoder=text_encoder, + tokenizer=tokenizer, + boundary_ratio=0.875, + ) + + save_dir = tmp_path / "tiny_wan22" + pipe.save_pretrained(save_dir) + return save_dir diff --git a/tests/examples/diffusers/test_sparsity.py b/tests/examples/diffusers/test_sparsity.py new file mode 100644 index 0000000000..d33be1df68 --- /dev/null +++ b/tests/examples/diffusers/test_sparsity.py @@ -0,0 +1,104 @@ +# SPDX-FileCopyrightText: Copyright (c) 2024 
NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Tests for skip-softmax sparse attention on Wan 2.2 (examples/diffusers/sparsity/). + +Uses a tiny Wan 2.2 model (dual transformer, 2 layers, hidden_dim=24) created +from scratch. Tests run the wan22_skip_softmax.py example script in baseline, +triton-baseline, and raw-threshold modes. +""" + +import pytest +from _test_utils.examples.run_command import run_example_command +from _test_utils.torch.diffusers_models import create_tiny_wan22_pipeline_dir + +EXAMPLE_PATH = "diffusers/sparsity" + +# Tiny inference settings — fast but exercises all code paths +_TINY_ARGS = [ + "--num-frames", + "5", + "--height", + "16", + "--width", + "16", + "--num-steps", + "2", + "--guidance-scale", + "1.0", + "--skip-first-last", + "0", + "--negative-prompt", + "", +] + + +@pytest.fixture(scope="session") +def tiny_wan22_path(tmp_path_factory): + """Create a tiny Wan 2.2 pipeline saved to disk (session-scoped).""" + return str(create_tiny_wan22_pipeline_dir(tmp_path_factory.mktemp("tiny_wan22"))) + + +def test_wan22_baseline(tiny_wan22_path, tmp_path): + """Dense baseline — no sparsity, default diffusers attention backend.""" + cmd = [ + "python", + "wan22_skip_softmax.py", + "--model-path", + tiny_wan22_path, + "--baseline", + "--prompt", + "test", + "--output", + str(tmp_path / "baseline.mp4"), + *_TINY_ARGS, + ] + run_example_command(cmd, 
EXAMPLE_PATH)
+
+
+def test_wan22_triton_baseline(tiny_wan22_path, tmp_path):
+    """Triton kernel without skip-softmax (threshold=0, apples-to-apples)."""
+    cmd = [
+        "python",
+        "wan22_skip_softmax.py",
+        "--model-path",
+        tiny_wan22_path,
+        "--triton-baseline",
+        "--prompt",
+        "test",
+        "--output",
+        str(tmp_path / "triton_baseline.mp4"),
+        *_TINY_ARGS,
+    ]
+    run_example_command(cmd, EXAMPLE_PATH)
+
+
+def test_wan22_raw_threshold(tiny_wan22_path, tmp_path):
+    """Skip-softmax with a fixed raw threshold — no calibration needed."""
+    cmd = [
+        "python",
+        "wan22_skip_softmax.py",
+        "--model-path",
+        tiny_wan22_path,
+        "--raw-threshold",
+        "-5.0",
+        "--report-avg-sparsity",
+        "--prompt",
+        "test",
+        "--output",
+        str(tmp_path / "raw_threshold.mp4"),
+        *_TINY_ARGS,
+    ]
+    run_example_command(cmd, EXAMPLE_PATH)

From aa44a9dd6b3556a600593ed7d35e0f9d3780bf94 Mon Sep 17 00:00:00 2001
From: Jingyu Xin
Date: Thu, 9 Apr 2026 04:07:36 +0000
Subject: [PATCH 18/21] Fixed the lint error

Signed-off-by: Jingyu Xin
---
 examples/diffusers/sparsity/README.md                     | 3 +--
 .../sparsity/attention_sparsity/calibration/calibrator.py | 8 ++++----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/examples/diffusers/sparsity/README.md b/examples/diffusers/sparsity/README.md
index 71df2e1c5b..3b4c541a3f 100644
--- a/examples/diffusers/sparsity/README.md
+++ b/examples/diffusers/sparsity/README.md
@@ -1,6 +1,6 @@
 # Skip-Softmax Sparse Attention for Diffusion Models
 
-Skip-softmax sparse attention (BLASST, https://arxiv.org/pdf/2512.12087) skips KV
+Skip-softmax sparse attention (BLASST, <https://arxiv.org/pdf/2512.12087>) skips KV
 tiles whose attention scores are negligible during the FlashAttention
 computation, reducing FLOPs without retraining.
 
@@ -136,4 +136,3 @@ mtsa.sparsify(transformer, config, forward_loop)
 - **Calibration sparsity ratio**: The calibrated threshold goes through `log2(threshold) * sm_scale` conversion, producing `skip_threshold_log2` values in a different scale than raw thresholds. 
Needs investigation to ensure the fitted (a, b) parameters produce expected sparsity levels. - **14B dual transformer calibration**: Transformers are calibrated sequentially — transformer_2's calibration runs while transformer_1 is already sparsified, introducing asymmetric calibration conditions. - diff --git a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py index 34d8987175..d3ed330325 100644 --- a/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py +++ b/modelopt/torch/sparsity/attention_sparsity/calibration/calibrator.py @@ -243,7 +243,9 @@ def exponential(sparsity, a, b): print(f" Fitted a: {a:.6e}") print(f" Fitted b: {b:.4f}") print(f" R-squared: {r_squared:.6f}") - print(f" Observed sparsity range: [{min_observed_sparsity:.1%}, {max_observed_sparsity:.1%}]") + print( + f" Observed sparsity range: [{min_observed_sparsity:.1%}, {max_observed_sparsity:.1%}]" + ) print(f" Data points used: {int(np.sum(valid_mask))} / {len(all_data_points)}") # Show scale_factor for various target sparsities @@ -253,9 +255,7 @@ def exponential(sparsity, a, b): for target in [0.3, 0.4, 0.5, 0.6, 0.7, 0.8]: sf = a * np.exp(b * target) note = "" - if target < min_observed_sparsity: - note = "(extrapolation)" - elif target > max_observed_sparsity: + if target < min_observed_sparsity or target > max_observed_sparsity: note = "(extrapolation)" print(f" {target:<10.0%} {sf:<15.4f} {note:<20}") From e5293dec92d9662040aba4a4a66fa2de7ab38be9 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Wed, 8 Apr 2026 21:35:38 -0700 Subject: [PATCH 19/21] Updated the README Signed-off-by: Jingyu Xin --- examples/diffusers/sparsity/README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/examples/diffusers/sparsity/README.md b/examples/diffusers/sparsity/README.md index 3b4c541a3f..8e6c69112b 100644 --- a/examples/diffusers/sparsity/README.md +++ 
b/examples/diffusers/sparsity/README.md @@ -10,7 +10,8 @@ Two modes are supported: - **Calibrated threshold** — an exponential model (`scale_factor = a * exp(b * target_sparsity)`) is calibrated once via the Triton calibration kernel, then the target sparsity can be adjusted at runtime - without recalibration. + without recalibration. Log-space fitting (`fit_logspace=True`) is recommended + for diffusion models where scale_factors span many orders of magnitude. ## Supported Models @@ -43,6 +44,7 @@ python wan22_skip_softmax.py \ # Report runtime sparsity (per-layer tile skip ratios) python wan22_skip_softmax.py \ + --model-path /path/to/Wan2.2-T2V-A14B-Diffusers \ --raw-threshold -0.7 --report-avg-sparsity \ --prompt "A cat playing piano" --output out.mp4 ``` @@ -81,6 +83,7 @@ mtsa.sparsify(transformer, config, forward_loop) │ - Per-program output buffers (no atomic contention) │ - Python-side reduction: sum across programs ├─ Fit: scale_factor = a * exp(b * sparsity) + │ └─ fit_logspace=True: fits in log space (minimizes relative error) └─ Apply a, b to all modules └─ Inference: threshold = scale_factor / seq_k ``` @@ -114,13 +117,13 @@ mtsa.sparsify(transformer, config, forward_loop) | File | Role | |------|------| | `calibrate.py` | Orchestrates calibration. Skips RULER dataset when user provides `forward_loop` (diffusion models). Applies fitted (a, b) to all modules. | -| `calibrator.py` | `DynamicThresholdCalibrator`: collects (scale_factor, sparsity) pairs via Triton calibration kernel, fits exponential model `scale_factor = a * exp(b * sparsity)`. | +| `calibrator.py` | `DynamicThresholdCalibrator`: collects (scale_factor, sparsity) pairs via Triton calibration kernel, fits exponential model `scale_factor = a * exp(b * sparsity)`. Supports `fit_logspace=True` for log-space fitting (recommended for diffusion models). 
| ### Config & Conversion | File | Role | |------|------| -| `config.py` | `SparseAttentionAttributeConfig` with `skip_softmax_threshold`, `skip_softmax_raw_threshold`, calibration settings. | +| `config.py` | `SparseAttentionAttributeConfig` with `skip_softmax_threshold`, `skip_softmax_raw_threshold`, calibration settings. `CalibrationConfig` with `fit_logspace` field. | | `conversion.py` | `_register_diffusers_backends_if_needed()` auto-registers Triton backends on `sparsify()`. | | `sparse_attention.py` | `SparseAttentionModule` wrapper — delegates to method's `get_sparse_context()`. | @@ -134,5 +137,5 @@ mtsa.sparsify(transformer, config, forward_loop) ## Known Issues -- **Calibration sparsity ratio**: The calibrated threshold goes through `log2(threshold) * sm_scale` conversion, producing `skip_threshold_log2` values in a different scale than raw thresholds. Needs investigation to ensure the fitted (a, b) parameters produce expected sparsity levels. - **14B dual transformer calibration**: Transformers are calibrated sequentially — transformer_2's calibration runs while transformer_1 is already sparsified, introducing asymmetric calibration conditions. +- **Minimum achievable sparsity**: Even the strictest threshold may yield 30-40% sparsity on diffusion models (many tiles are inherently negligible). Targets below this floor cause extrapolation; an inference-time warning is emitted. 
From 40d61dd8a39f7571933a25f7f9dc1b6e58b19063 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Wed, 8 Apr 2026 22:16:51 -0700 Subject: [PATCH 20/21] Update the test case Signed-off-by: Jingyu Xin --- .../test_kernel_backends.py | 56 ------------------- 1 file changed, 56 deletions(-) diff --git a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py index b8685b8410..b9a753e395 100644 --- a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py +++ b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py @@ -71,57 +71,6 @@ def test_set_and_get(self): assert get_skip_softmax_context() is False -# --------------------------------------------------------------------------- -# Tests: diffusers eager attention -# --------------------------------------------------------------------------- - - -class TestDiffusersEagerAttention: - @pytest.fixture(autouse=True) - def _setup(self): - with patch.dict(sys.modules, _mock_diffusers()): - from modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_eager_attention import ( - _diffusers_eager_attention, - get_skip_softmax_attention_backend, - register_diffusers_eager_attention, - ) - - self._fn = _diffusers_eager_attention - self._register = register_diffusers_eager_attention - self._get_backend = get_skip_softmax_attention_backend - - import modelopt.torch.sparsity.attention_sparsity.kernels.diffusers_eager_attention as mod - - mod._BACKEND_REGISTERED = False - yield - - def test_basic_shape(self): - q = torch.randn(2, 8, 4, 16) - assert self._fn(q, q, q).shape == (2, 8, 4, 16) - - def test_cross_attention(self): - q = torch.randn(1, 4, 2, 8) - k = torch.randn(1, 12, 2, 8) - assert self._fn(q, k, k).shape == (1, 4, 2, 8) - - def test_causal(self): - q = torch.randn(1, 4, 1, 8) - assert self._fn(q, q, q, is_causal=True).shape == (1, 4, 1, 8) - - def test_gqa(self): - q = torch.randn(1, 4, 8, 16) - k = 
torch.randn(1, 4, 2, 16) - assert self._fn(q, k, k, enable_gqa=True).shape == (1, 4, 8, 16) - - def test_register_idempotent(self): - self._register() - self._register() - - def test_get_backend_before_register_raises(self): - with pytest.raises(RuntimeError, match="not registered"): - self._get_backend() - - # --------------------------------------------------------------------------- # Tests: diffusers triton attention # --------------------------------------------------------------------------- @@ -194,15 +143,10 @@ def test_with_diffusers_model(self): with ( patch.dict(sys.modules, {"diffusers.models.modeling_utils": mock_utils}), - patch( - "modelopt.torch.sparsity.attention_sparsity.kernels.register_diffusers_eager_attention", - MagicMock(), - ) as mock_eager, patch( "modelopt.torch.sparsity.attention_sparsity.kernels.register_diffusers_triton_attention", MagicMock(), ) as mock_triton, ): _register_diffusers_backends_if_needed(mock_mixin()) - mock_eager.assert_called_once() mock_triton.assert_called_once() From 3845b47c16b4468e6f098d6799f2a7b2c4fe2a73 Mon Sep 17 00:00:00 2001 From: Jingyu Xin Date: Thu, 9 Apr 2026 06:32:02 +0000 Subject: [PATCH 21/21] Fixed the CICD Signed-off-by: Jingyu Xin --- .../torch/sparsity/attention_sparsity/test_kernel_backends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py index b9a753e395..3dd94ccee4 100644 --- a/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py +++ b/tests/unit/torch/sparsity/attention_sparsity/test_kernel_backends.py @@ -20,7 +20,6 @@ from unittest.mock import MagicMock, patch import pytest -import torch import torch.nn as nn @@ -82,6 +81,7 @@ def _setup(self): mocks = _mock_diffusers() mk = types.ModuleType("modelopt.torch.kernels") mk.attention = lambda q, k, v, **kw: q + mk.attention_calibrate = None mk.IS_AVAILABLE = True 
mk.register_triton_attention = None mocks["modelopt.torch.kernels"] = mk