Commit 3ffcf87

Revert "wip(cute): B-fix attempt — consume-gate DCE + post-attn-LN dispatch ops"
This reverts commit 514b88c.
1 parent 514b88c commit 3ffcf87

3 files changed: 14 additions & 294 deletions

vllm/nvllm/models/qwen3_5.py (14 additions & 69 deletions)
@@ -463,50 +463,17 @@ def _ct_mark(label: str) -> None:
                 positions=positions,
             )
             _ct_mark("self_attn")
-            # 2026-04-26 (B-fix): the prior `if getattr(impl, "_fusion_active",
-            # False)` Python-bool gate was dead-eliminated by torch.compile —
-            # at trace time `_fusion_active` was False (impl __init__ default),
-            # so dynamo specialised the if-branch as dead and the captured
-            # graph always ran the else fall-through. Empirically verified
-            # via /root/.cache/vllm/torch_compile_cache/<hash>/.../
-            # computation_graph.py: the consume `.copy_()` calls were absent
-            # AND the legacy Python o_proj path was always present.
-            #
-            # Replace with an opaque op (cute_attn_consume) that always runs
-            # in the captured graph and dispatches at runtime via
-            # `impl._fusion_active_signal` (a 0-dim int32 tensor mutated
-            # inside the unified_attention opaque op, where dynamo can't
-            # see the change). When the signal == 0, the op no-ops; when
-            # it's > 0 (β-coop fired), it copies β-coop's outputs into
-            # self_attention_output and residual.
-            #
-            # residual_buf and gate_buf are passed as PHANTOM inputs: they
-            # are not used inside the op body, but their presence forces
-            # a data dependency on cute_residual_mirror's output, which
-            # otherwise gets DCE'd despite mutates_args (verified: only
-            # mutates_args is NOT enough to survive DCE if no graph op
-            # reads the mutated tensor).
-            if impl is not None and getattr(impl, "_fusion_bound", False):
-                # Consistent with the cute_residual_mirror gate above
-                # (_fusion_bound is set in attach_fusion, stable at trace
-                # time — dynamo's specialization on it is correct because
-                # it's a one-time setup flag, not a per-step runtime flag).
-                # The op uses _CUTE_ATTN_REGISTRY[layer_name] internally to
-                # read impl._phase_e_use_beta_coop at runtime — Python attr
-                # access only, no .item() / no CUDA sync, safe under graph
-                # capture (verified failure mode 2026-04-26 from .item():
-                # cudaErrorStreamCaptureInvalidated).
-                torch.ops.vllm.cute_attn_consume(
-                    self_attention_output,
-                    residual,
-                    impl.rmsnorm_output,
-                    impl.residual_output,
-                    impl.residual_buf,
-                    impl.gate_buf,
-                    self.self_attn.attn.layer_name,
-                )
-            hidden_states = self_attention_output
-            _ct_mark("attn_consume_or_legacy")
+            if impl is not None and getattr(impl, "_fusion_active", False):
+                # Kernel already did gate*attn, W_O GEMV, residual+RMSNorm.
+                self_attention_output[:nat].copy_(impl.rmsnorm_output[:nat])
+                if nat < num_tokens:
+                    self_attention_output[nat:].zero_()
+                residual[:nat].copy_(impl.residual_output[:nat])
+                hidden_states = self_attention_output
+                _ct_mark("attn_consume")
+            else:
+                hidden_states = self_attention_output
+                _ct_mark("attn_legacy")
         else:
             raise ValueError("Invalid layer_type")
 
@@ -520,35 +487,13 @@ def _ct_mark(label: str) -> None:
                 self.attn_layer_scale.to(hidden_states.dtype) + 1
             )
 
-        # 2026-04-26 (B-fix): post_attn_LN dispatch via opaque op for full
-        # attention layers (replacing the dead-eliminated Python-bool gate).
-        # The prior `if not getattr(impl, "_fusion_active", False)` was
-        # specialised to always-run by dynamo (because trace-time
-        # `_fusion_active = False`, so `not False = True`). The captured
-        # graph ran post_attn_LN unconditionally — fine in dual-fire (β-coop's
-        # rmsnorm_output was unused anyway, Python pipeline did the work)
-        # but in solo it operated over uninitialised self_attention_output
-        # because β-coop doesn't expose Phase A to the framework `output`
-        # parameter.
-        #
-        # Linear-attention layers have impl=None and no fusion signal, so
-        # they keep the plain Python module call (no compile fragility there
-        # — the dead-elim only bites paths that depend on a runtime-mutated
-        # Python attribute).
-        if impl is not None and getattr(impl, "_fusion_bound", False):
-            torch.ops.vllm.cute_post_attn_ln_dispatch(
-                hidden_states,
-                residual,
-                self.post_attention_layernorm.weight,
-                float(self.post_attention_layernorm.variance_epsilon),
-                self.self_attn.attn.layer_name,
-            )
-            _ct_mark("post_attn_ln_dispatch")
-        else:
+        if not getattr(impl, "_fusion_active", False):
             hidden_states, residual = self.post_attention_layernorm(
                 hidden_states, residual
             )
             _ct_mark("post_attn_ln")
+        else:
+            _ct_mark("post_attn_skip")
 
         # Phase E β-lite consume. When the CuTe backend launched the
         # β-lite dispatch inside its forward, the MLP kernel's ε epilogue
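
The comment block removed above describes the underlying torch.compile behaviour: an `if` that tests a plain Python attribute is specialised at trace time, so a mutation performed later inside an opaque custom op never re-enables the branch. Below is a minimal, self-contained sketch of that failure mode. It is illustrative only, not vLLM code: the `Impl` class, the `demo::mark_fusion` op, and the attribute name are invented for the example, and it assumes PyTorch 2.4+ for `torch.library.custom_op`.

import torch
from torch.library import custom_op


class Impl:
    def __init__(self) -> None:
        self.fusion_active = False  # value dynamo sees at trace time


impl = Impl()


@custom_op("demo::mark_fusion", mutates_args=())
def mark_fusion(x: torch.Tensor) -> torch.Tensor:
    impl.fusion_active = True  # runtime side effect, invisible to dynamo
    return x.clone()


@mark_fusion.register_fake
def _(x: torch.Tensor) -> torch.Tensor:
    return torch.empty_like(x)


def layer(x: torch.Tensor) -> torch.Tensor:
    y = torch.ops.demo.mark_fusion(x)
    # Traced while impl.fusion_active is False, so the compiled graph keeps
    # only the else path, even though the op above flips the flag earlier in
    # the same call.
    if impl.fusion_active:
        return y * 2.0
    return y + 1.0


compiled = torch.compile(layer)
print(compiled(torch.full((2,), 3.0)))  # tensor([4., 4.]): the "+ 1" path runs

The same mechanism applies to both reverted gates in this file: the consume branch and the post_attn_LN skip key off an attribute that only changes inside the opaque unified_attention op.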

vllm/v1/attention/backends/cute_paged/_backend.py (0 additions & 48 deletions)
@@ -336,20 +336,6 @@ def _preallocate_fusion_buffers(
             max_num_seqs, hidden_dim, dtype=torch.bfloat16, device=device
         )
 
-        # 2026-04-26 (B-fix): runtime signal tensor for the consume-or-postln
-        # dispatch in qwen3_5.py. 0-dim int32 written inside impl.forward
-        # (which is wrapped in the unified_attention opaque op, invisible
-        # to dynamo). Read at runtime by `cute_attn_consume` and
-        # `cute_post_attn_ln_dispatch` ops via .item(). Value: 0 = fusion
-        # didn't fire (run Python o_proj/post_attn_LN normally), N > 0 =
-        # fusion fired with N tokens (use β-coop outputs, skip post_attn_LN).
-        # This replaces the dead-eliminated `getattr(impl, "_fusion_active",
-        # False)` Python-bool gates with a tensor-based signal that survives
-        # torch.compile specialization.
-        self._fusion_active_signal = torch.zeros(
-            (), dtype=torch.int32, device=device
-        )
-
         # Phase D MLP fusion buffers. Shape-defining axes (`slice_ctas`
         # for `mlp_partial_fp32`, `num_k_tiles` for `mlp_arrival_count`)
         # are both kernel-side constants resolved inside
@@ -669,15 +655,6 @@ def _resolve_fusion_weights(self) -> None:
             return
 
         self._fusion_bound = True
-        # 2026-04-26 (B-fix): register self in the attn-consume registry so
-        # cute_attn_consume / cute_post_attn_ln_dispatch can look up the impl
-        # at runtime via layer_name string. Avoids passing impl as a custom-op
-        # arg (not supported) AND avoids reading a 0-dim tensor signal via
-        # .item() (causes cudaErrorStreamCaptureInvalidated under graph capture).
-        from vllm.v1.attention.backends.cute_paged._mlp_op import (
-            _CUTE_ATTN_REGISTRY,
-        )
-        _CUTE_ATTN_REGISTRY[self._fusion_prefix] = self
         logger.info(
             "CuTe fusion resolved: layer=%s wo_weight=%s rmsnorm_gamma=%s",
             self._fusion_prefix,
@@ -1043,23 +1020,6 @@ def forward(
         fits_buffer = num_actual_tokens <= getattr(self, "_fusion_max_num_seqs", 0)
         self._fusion_active = self._fusion_bound and is_decode_only and fits_buffer
         use_fusion = self._fusion_active
-        # 2026-04-26 (B-fix): per-step reset for the consume gate. Both flags
-        # are read inside opaque op bodies (cute_attn_consume and
-        # cute_post_attn_ln_dispatch) at runtime via Python attribute access
-        # — keyed off impl from _CUTE_ATTN_REGISTRY by layer_name. Resetting
-        # here ensures the gate reflects THIS forward call (β-coop may not
-        # fire even when fusion is bound, e.g. predicate fails or kernel
-        # falls back to β-lite/paged via the except handler below).
-        #
-        # _fusion_active_signal stays as a 0-dim tensor for debug visibility
-        # but is NOT read inside the consume ops anymore — switching to
-        # Python attr access avoids the .item() host-device sync that broke
-        # CUDA graph capture (cudaErrorStreamCaptureInvalidated 2026-04-26).
-        # Kept commented-out for the moment so the .fill_() side effect can
-        # be re-enabled if a future debug session wants the visibility back.
-        self._phase_e_use_beta_coop = False
-        # if hasattr(self, "_fusion_active_signal"):
-        #     self._fusion_active_signal.fill_(0)
         # --- PHASE D2 DISABLED (commented, not deleted — Phase B/C debug may
         # need this reset back) ---
         # Pre-D2, the MLP fusion launch was an attention-side side effect
@@ -1350,14 +1310,6 @@ def forward(
                 )
                 self._phase_e_consumed = True
                 self._phase_e_use_beta_coop = True
-                # 2026-04-26 (B-fix): the consume gate now reads
-                # `impl._phase_e_use_beta_coop` (Python attr) inside the
-                # opaque op body via _CUTE_ATTN_REGISTRY lookup — no .item()
-                # call, no host-device sync, CUDA-graph-safe. The tensor
-                # signal `.fill_(nat)` below is kept commented (not deleted)
-                # so it can be re-enabled if a future debug session wants
-                # tensor-side visibility into β-coop firing decisions.
-                # self._fusion_active_signal.fill_(nat)
                 # 2026-04-26: ENV-GATED dump for off-line math verification.
                 # CUTE_DUMP_TENSORS=1 enables; bounded to first 3 decode
                 # steps × 16 full-attn layers so disk doesn't bloat. Files
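
Several of the comments removed above cite the same constraint: reading a 0-dim device tensor with `.item()` performs a device-to-host copy plus a stream sync, which is not allowed while a CUDA stream is being captured, so the capture is invalidated (the cudaErrorStreamCaptureInvalidated the comments mention), whereas reading a plain Python attribute involves no device work. A standalone sketch of the pitfall, assuming a CUDA device is available; this is an illustration, not vLLM code, and the exact exception text varies with the PyTorch and driver versions.

import torch

signal = torch.zeros((), dtype=torch.int32, device="cuda")
graph = torch.cuda.CUDAGraph()

try:
    with torch.cuda.graph(graph):
        # Host-device sync during capture: CUDA rejects it and the capture
        # is invalidated, surfacing as a RuntimeError.
        _ = signal.item()
except RuntimeError as exc:
    print(f"capture failed: {exc}")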

vllm/v1/attention/backends/cute_paged/_mlp_op.py (0 additions & 177 deletions)
@@ -46,17 +46,6 @@
 # The op body reads from this dict at runtime (not at trace time).
 _CUTE_MLP_REGISTRY: dict[str, "CutePagedAttentionImpl"] = {}
 
-# 2026-04-26 (B-fix): attn-consume registry, populated by
-# `CutePagedAttentionImpl.attach_fusion`. Same impl object as
-# _CUTE_MLP_REGISTRY but keyed by ATTENTION layer name (e.g.
-# `language_model.model.layers.3.self_attn.attn`), not the MLP key
-# used by cute_phase_e_dispatch. Allows cute_attn_consume and
-# cute_post_attn_ln_dispatch to look up the impl and read its
-# Python-side flags at runtime — avoids the .item() host-device sync
-# on a 0-dim tensor signal (which raises cudaErrorStreamCaptureInvalidated
-# under CUDA graph capture, verified 2026-04-26).
-_CUTE_ATTN_REGISTRY: dict[str, "CutePagedAttentionImpl"] = {}
-
 
 def _cute_mlp_forward_impl(
     x: torch.Tensor,
@@ -320,169 +309,3 @@ def _cute_residual_mirror_fake(
     mutates_args=["residual_buf"],
     fake_impl=_cute_residual_mirror_fake,
 )
-
-
-# --- 2026-04-26: cute_attn_consume + cute_post_attn_ln_dispatch ----------------
-# B-fix: replace the dead-eliminated Python `if _fusion_active` consume branch
-# at qwen3_5.py:466-476 and the dead-eliminated `if not _fusion_active`
-# post_attention_layernorm gate at qwen3_5.py:490-496.
-#
-# WHY needed: the captured FX graph (verified 2026-04-26 via
-# /root/.cache/vllm/torch_compile_cache/<hash>/rank_0_0/backbone/computation_graph.py)
-# specialized BOTH gates at trace time on `_fusion_active = False` (the impl's
-# __init__ default) — dynamo can't see the runtime mutation that happens inside
-# the unified_attention opaque op. Result: the consume copy was DCE'd, the
-# legacy Python o_proj + post_attn_LN ALWAYS ran, β-coop's rmsnorm_output /
-# residual_output were never read by the captured graph. In dual-fire this
-# happened to produce coherent output because paged populated `output` with
-# Phase A and the Python pipeline applied o_proj + post_attn_LN over it. In
-# solo (paged gated off, β-coop only), `output` stayed uninitialised and
-# Python applied o_proj over junk → gibberish.
-#
-# Fix: route the consume / postln decision through a runtime tensor signal
-# (`impl._fusion_active_signal`, 0-dim int32) that's mutated INSIDE the
-# unified_attention op (invisible to dynamo's specialization) and read at
-# runtime via .item() inside these opaque ops. Both ops always run, dispatch
-# at runtime via the signal value:
-#   signal == 0 : non-fusion mode (β-coop didn't fire). consume no-ops;
-#                 postln applies the fused-residual RMSNorm in-place over
-#                 the Python o_proj's wo_out.
-#   signal > 0  : fusion mode (β-coop fired with N=signal tokens). consume
-#                 copies β-coop's rmsnorm_output → self_attention_output and
-#                 residual_output → residual; postln no-ops (β-coop's Phase
-#                 1C already produced LN(post_input_LN_residual + wo_out)·γ).
-#
-# residual_buf and gate_buf are passed to consume as PHANTOM inputs (not
-# read inside the body) — their sole purpose is to give the cute_residual_mirror
-# and cute_residual_mirror(gate_buf, ...) ops observable downstream readers
-# in the captured graph, which prevents dynamo's DCE from dropping them
-# (verified empirically that mutates_args alone is NOT sufficient against
-# DCE — the ops were dead-eliminated despite mutates_args=["residual_buf"]
-# until a downstream reader was added).
-
-
-def _cute_attn_consume_impl(
-    self_attention_output: torch.Tensor,  # mutated [num_tokens, hidden_dim] BF16
-    residual: torch.Tensor,  # mutated [num_tokens, hidden_dim] BF16
-    rmsnorm_output: torch.Tensor,  # impl.rmsnorm_output [max_num_seqs, hidden_dim] BF16
-    residual_output: torch.Tensor,  # impl.residual_output [max_num_seqs, hidden_dim] BF16
-    residual_buf: torch.Tensor,  # phantom for cute_residual_mirror dep
-    gate_buf: torch.Tensor,  # phantom for gate-mirror dep
-    layer_name: str,  # registry key into _CUTE_ATTN_REGISTRY
-) -> None:
-    """If β-coop fired this step: copy its outputs into model-side tensors.
-
-    Reads `impl._phase_e_use_beta_coop` (Python attr) at runtime via
-    `_CUTE_ATTN_REGISTRY[layer_name]` — no .item() call, no CUDA sync,
-    safe under CUDA graph capture. Reset to False at top of impl.forward,
-    set to True only on successful β-coop launch — so True ⇔ β-coop wrote
-    rmsnorm_output and residual_output for THIS forward call.
-    """
-    impl = _CUTE_ATTN_REGISTRY.get(layer_name)
-    # 2026-04-26 (B-fix v2): gate on `_fusion_bound` (set once at
-    # attach_fusion, stable across warmup + runtime) rather than
-    # `_phase_e_use_beta_coop` (set per-step inside impl.forward — not
-    # consistently True at warmup capture time, so the captured segment
-    # would skip the consume kernels and replay would never fill
-    # self_attention_output from β-coop's outputs). With _fusion_bound:
-    # capture always sees True for fusion-bound full-attn layers,
-    # consume kernels always captured. Cost: if β-coop ever fails to
-    # fire at runtime (e.g. predicate fails), consume reads stale
-    # impl.rmsnorm_output. Mitigated by the predicate hard-gate landed
-    # in the prior commit which prevents silent β-coop fallthrough on
-    # cooperative-launch-too-large.
-    if impl is None or not getattr(impl, "_fusion_bound", False):
-        # Non-fusion / non-bound: leave self_attention_output as-is (Python
-        # o_proj already wrote it) and residual untouched.
-        return
-    # Fusion mode: β-coop's Phase 1C produced these. Bound by buffer capacity
-    # defensively (matches the original Python consume branch).
-    nat = min(self_attention_output.shape[0], rmsnorm_output.shape[0])
-    self_attention_output[:nat].copy_(rmsnorm_output[:nat])
-    if nat < self_attention_output.shape[0]:
-        # Match the prior `if nat < num_tokens: self_attention_output[nat:].zero_()`
-        # — keeps unused rows deterministic across decode steps.
-        self_attention_output[nat:].zero_()
-    residual[:nat].copy_(residual_output[:nat])
-
-
-def _cute_attn_consume_fake(
-    self_attention_output: torch.Tensor,
-    residual: torch.Tensor,
-    rmsnorm_output: torch.Tensor,
-    residual_output: torch.Tensor,
-    residual_buf: torch.Tensor,
-    gate_buf: torch.Tensor,
-    layer_name: str,
-) -> None:
-    return
-
-
-direct_register_custom_op(
-    op_name="cute_attn_consume",
-    op_func=_cute_attn_consume_impl,
-    # Both self_attention_output and residual are mutated when fusion fires;
-    # the phantom inputs are read-only.
-    mutates_args=["self_attention_output", "residual"],
-    fake_impl=_cute_attn_consume_fake,
-)
-
-
-def _cute_post_attn_ln_dispatch_impl(
-    hidden_states: torch.Tensor,  # mutated [num_tokens, hidden_dim] BF16
-    residual: torch.Tensor,  # mutated [num_tokens, hidden_dim] BF16
-    weight: torch.Tensor,  # post_attention_layernorm.weight [hidden_dim] BF16
-    rmsnorm_eps: float,
-    layer_name: str,  # registry key into _CUTE_ATTN_REGISTRY
-) -> None:
-    """If β-coop did NOT fire: apply fused-residual post_attention_layernorm.
-
-    Mirrors `_forward_static_with_residual` in vllm/nvllm/layers/layernorm.py:
-        combined = hidden_states + residual
-        residual = combined
-        x = combined.float()
-        var = x.pow(2).mean(dim=-1, keepdim=True)
-        x = x * torch.rsqrt(var + eps)
-        x = x * (1.0 + weight.float())
-        hidden_states = x.to(combined.dtype)
-
-    When β-coop fired, its Phase 1C already produced this exact output into
-    hidden_states via cute_attn_consume above, and residual already holds
-    residual_post_attn — skip to avoid double-LN.
-
-    Reads `impl._phase_e_use_beta_coop` (Python attr) — no .item() needed,
-    CUDA-graph-safe. See cute_attn_consume docstring for the gate semantics.
-    """
-    impl = _CUTE_ATTN_REGISTRY.get(layer_name)
-    # See cute_attn_consume docstring above for why we gate on _fusion_bound
-    # rather than _phase_e_use_beta_coop. Symmetric: when consume fires,
-    # post_attn_LN must skip; when consume no-ops, post_attn_LN must apply.
-    if impl is not None and getattr(impl, "_fusion_bound", False):
-        # Fusion mode: β-coop already did post_attn_LN. Skip.
-        return
-    # Non-fusion mode: replicate _forward_static_with_residual in-place.
-    combined = hidden_states + residual
-    residual.copy_(combined)
-    x = combined.float()
-    var = x.pow(2).mean(dim=-1, keepdim=True)
-    x = x * torch.rsqrt(var + rmsnorm_eps)
-    x = x * (1.0 + weight.float())
-    hidden_states.copy_(x.to(combined.dtype))
-
-
-def _cute_post_attn_ln_dispatch_fake(
-    hidden_states: torch.Tensor,
-    residual: torch.Tensor,
-    weight: torch.Tensor,
-    rmsnorm_eps: float,
-    layer_name: str,
-) -> None:
-    return
-
-
-direct_register_custom_op(
-    op_name="cute_post_attn_ln_dispatch",
-    op_func=_cute_post_attn_ln_dispatch_impl,
-    mutates_args=["hidden_states", "residual"],
-    fake_impl=_cute_post_attn_ln_dispatch_fake,
-)
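
The reverted `_cute_post_attn_ln_dispatch_impl` replicates, in place, the fused-residual RMSNorm its docstring quotes from `_forward_static_with_residual`. As a quick sanity check of that math, here is a small CPU-only sketch comparing the out-of-place form from the docstring against the in-place replication; the helper names are illustrative, not vLLM APIs.

import torch


def reference(hidden_states, residual, weight, eps):
    # Out-of-place form quoted in the reverted docstring.
    combined = hidden_states + residual
    x = combined.float()
    var = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(var + eps)
    x = x * (1.0 + weight.float())
    return x.to(combined.dtype), combined


def in_place(hidden_states, residual, weight, eps):
    # In-place replication, mirroring the body of the reverted op.
    combined = hidden_states + residual
    residual.copy_(combined)
    x = combined.float()
    var = x.pow(2).mean(dim=-1, keepdim=True)
    x = x * torch.rsqrt(var + eps)
    x = x * (1.0 + weight.float())
    hidden_states.copy_(x.to(combined.dtype))


torch.manual_seed(0)
h = torch.randn(4, 8, dtype=torch.bfloat16)
r = torch.randn(4, 8, dtype=torch.bfloat16)
w = torch.randn(8, dtype=torch.bfloat16)

ref_out, ref_res = reference(h.clone(), r.clone(), w, 1e-6)
h2, r2 = h.clone(), r.clone()
in_place(h2, r2, w, 1e-6)
assert torch.equal(h2, ref_out) and torch.equal(r2, ref_res)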
