
Commit 54da780

Natfii and claude committed
refactor(cute): C1.5 — delete Phase 4 + F.1 layer-LN bake plumbing
Per audit Finding 1 and the Q4 self-review, the F.1 layer-LN bake machinery couldn't survive Qwen3.5's stride-4 layer pattern: Phase 4 added mlp_out into residual_output in place, and the next layer (linear-attn, every 4th layer) doesn't honor the F.1 skip-op — so its input_layernorm re-applied LN over the pre-baked output, corrupting the residual stream.

Resolution: per-layer input_layernorm at every decoder layer entry, matching the unfused flow and every surveyed hybrid model (Jamba, Zamba2, Qwen3-Next, Megatron hybrid). β-coop's output is now (mlp_output, residual_output=residual_post_attn); layer N+1's input_layernorm in Python does the residual+mlp accumulation.

Deletions:
- cute_phase_e_skip_input_layernorm op (_mlp_op.py)
- attach_input_layernorm + attach_next_input_layernorm methods commented out (kept commented per feedback_comment_not_delete; C4 fully removes)
- _phase_e_skip_next_ln, _input_layernorm_module field inits
- Phase 4 ε epilogue from the run_beta_coop_full body and from the _kernel_phase_0_to_4 JIT (~150 lines removed)
- run_beta_coop_full's next_input_layernorm_gamma, next_hidden_output, emit_next_layernorm parameters
- attach loops in Qwen3_5Model.__init__
- skip-op call site in Qwen3_5DecoderLayer.forward — replaced with an unconditional self.input_layernorm(hidden_states, residual)

Cascade fixes (authorized in implementer dispatch):
- next_hidden_scratch allocation moved from attach_next_input_layernorm to __init__ — β-lite (kept through C3) still references it
- _phase_e_attached gate at _backend.py:1147 rewired from hasattr(_next_input_layernorm_module) to (_phase_e_coop_kernel is not None or _mlp_fusion_bound)
- cute_phase_e_dispatch consume branch reads impl.mlp_output[:nat] (was impl.next_hidden_scratch[:nat])
- _next_input_layernorm_module + _emit_next_layernorm field inits KEPT as defensive defaults (β-lite reads them via getattr-with-default)

Out of scope (kept untouched):
- β-lite launch site at _backend.py:1278+ (deleted in C3 with the rest of β-lite)
- Standalone Phase 4 launcher (run_phase_4_only, _jit_launch_phase_4_only, _kernel_phase_4_only) at phase_e_kernel.py:2412-2683 — test-only / β-lite-style infra
- paged_attention_forward in kernel.py (C2 retires it from decode)

L3 multi-layer test added at tests/v1/cute_paged/test_uber_kernel_multi_layer.py with 5 source-text assertions covering the deletions and the unconditional input_layernorm regime. Pytest: 7/7 PASS (2 C1 + 5 C1.5).

Validation:
- Live serve probe with CUTE_PHASE_E_FUSION=1: coherent reasoning output; "The capital of France is" → " Paris, and Paris is located in France, so Paris is" — the math fix holds.
- gsm8k_eval_50 ≥90% gate DEFERRED to C2: throughput is still collapsed at ~0.7 tok/s by paged_attention_forward plus the β-coop double-fire of Phases A+B+C. C2 retires paged_attention_forward from decode and recovers throughput; the gsm8k gate runs there.

Diff: 4 modified + 1 new file, -217 net lines.

Refs:
- docs/superpowers/specs/2026-04-25-uber-kernel-migration-design.md
- docs/research/uber_kernel_migration/spec_audit_2026-04-25.md (Finding 1)
- docs/research/uber_kernel_migration/q4_brainstorm_layer_LN_2026-04-25.md
- memory:feedback_layer_output_contract
- memory:feedback_comment_not_delete

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
1 parent a65bcef commit 54da780
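For reference, a minimal sketch of the post-C1.5 layer boundary described in the message above. This is illustrative only, not code from the diff: it assumes the usual fused-add norm contract (norm(x, residual) returns the normed tensor plus the accumulated residual), and the names RMSNormWithResidual and enter_next_layer are placeholders, not identifiers from the repository.

import torch
from torch import nn


class RMSNormWithResidual(nn.Module):
    """Illustrative stand-in for a decoder layer's input_layernorm.

    Assumed fused-add contract: forward(x, residual) first accumulates
    residual + x, then RMS-normalizes, and returns (normed, new_residual).
    """

    def __init__(self, hidden_size: int, eps: float = 1e-6) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.eps = eps

    def forward(self, x, residual=None):
        if residual is not None:
            # This is the residual + mlp_out sum that Phase 4 used to do
            # in place inside the previous layer's kernel epilogue.
            x = x + residual
            residual = x
        var = x.float().pow(2).mean(-1, keepdim=True)
        normed = (x.float() * torch.rsqrt(var + self.eps)).to(x.dtype) * self.weight
        return normed if residual is None else (normed, residual)


# Post-C1.5 boundary: layer N's beta-coop kernel returns
# (mlp_output, residual_post_attn); layer N+1 then runs its own
# input_layernorm unconditionally at entry (no skip-op, no Phase 4 bake).
def enter_next_layer(mlp_output, residual_post_attn, input_layernorm):
    hidden_states, residual = input_layernorm(mlp_output, residual_post_attn)
    return hidden_states, residual

Because the accumulation now happens at every layer entry, the contract no longer depends on whether the following layer is full attention or linear attention, which is the failure mode Finding 1 describes.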

5 files changed

Lines changed: 354 additions & 457 deletions

tests/v1/cute_paged/test_uber_kernel_multi_layer.py

Lines changed: 114 additions & 0 deletions
@@ -0,0 +1,114 @@
+"""L3 multi-layer test: verifies layer-boundary semantics post-C1.5.
+
+Catches:
+- Phase 4 not adding mlp_out (audit Finding 1) — layer N+1's input_LN does the sum
+- Per-layer input_layernorm fires unconditionally (no skip-op fall-through)
+- F.1 layer-LN bake plumbing (skip-op, attach methods, flags) is gone
+- run_beta_coop_full no longer takes Phase 4 / next-LN parameters
+- cute_phase_e_dispatch consume branch reads mlp_output, not next_hidden_scratch
+
+Strategy: pure source-text inspection via `inspect.getsource`. The full
+kernel-level diff is covered by L4 (gsm8k); this test catches the
+structural class. No CUDA, no kernel launch — runs anywhere.
+"""
+import inspect
+
+
+def test_qwen35_layer_forward_runs_input_layernorm_unconditionally():
+    """qwen3_5.py: input_LN gate must collapse to unconditional run.
+
+    Post-C1.5 the non-first-layer branch of Qwen3_5DecoderLayer.forward
+    must call self.input_layernorm(...) directly — no skip-op detour
+    via cute_phase_e_skip_input_layernorm.
+    """
+    from vllm.nvllm.models import qwen3_5
+    src = inspect.getsource(qwen3_5.Qwen3_5DecoderLayer.forward)
+    assert "cute_phase_e_skip_input_layernorm" not in src, (
+        "F.1 skip-op call site still present in layer forward. "
+        "Should be deleted in C1.5."
+    )
+    assert "self.input_layernorm(hidden_states, residual)" in src, (
+        "Expected unconditional self.input_layernorm(hidden_states, residual) "
+        "call in non-first-layer branch."
+    )
+
+
+def test_no_attach_input_layernorm_loops_in_model_init():
+    """qwen3_5.py source must drop attach_*_layernorm loops.
+
+    Both attach_input_layernorm and attach_next_input_layernorm loops
+    are gone — the F.1 cross-layer bake plumbing they enabled is gone.
+
+    We grep the file directly because Qwen3_5Model.__init__ is replaced
+    by @support_torch_compile, so inspect.getsource(Qwen3_5Model.__init__)
+    returns the wrapper, not the class body.
+    """
+    from vllm.nvllm.models import qwen3_5
+    with open(qwen3_5.__file__, "r") as f:
+        src = f.read()
+    assert "attach_input_layernorm" not in src, (
+        "attach_input_layernorm reference still present in qwen3_5.py. "
+        "C1.5 must delete the attach loop and any Phase F.1 plumbing. "
+        "(check for impl.attach_input_layernorm(...) call in Qwen3_5Model.__init__)"
+    )
+    assert "attach_next_input_layernorm" not in src, (
+        "attach_next_input_layernorm reference still present in qwen3_5.py. "
+        "C1.5 must delete the attach loop and any Phase F.1 plumbing. "
+        "(check for impl.attach_next_input_layernorm(...) call in Qwen3_5Model.__init__)"
+    )
+
+
+def test_skip_op_deleted():
+    """cute_phase_e_skip_input_layernorm op must be deleted entirely.
+
+    Both the impl/fake functions and the direct_register_custom_op
+    registration must be gone from _mlp_op.py.
+    """
+    from vllm.v1.attention.backends.cute_paged import _mlp_op
+    src = inspect.getsource(_mlp_op)
+    assert 'op_name="cute_phase_e_skip_input_layernorm"' not in src, (
+        "cute_phase_e_skip_input_layernorm op still registered. "
+        "C1.5 must delete the op registration and the impl/fake functions."
+    )
+
+
+def test_phase_4_deleted_from_run_beta_coop_full():
+    """Phase 4 args must be dropped from run_beta_coop_full's signature.
+
+    The kernel returns at the end of Phase 3 (MLP write). The next-layer
+    input_LN runs from Python at every layer entry instead of being baked
+    into the previous layer's epilogue.
+    """
+    from vllm.v1.attention.backends.cute_paged import phase_e_kernel
+    src = inspect.getsource(
+        phase_e_kernel.PhaseE_Beta_Kernel.run_beta_coop_full
+    )
+    assert "next_input_layernorm_gamma" not in src, (
+        "Phase 4 arg next_input_layernorm_gamma still present in "
+        "run_beta_coop_full. C1.5 must drop it."
+    )
+    assert "emit_next_layernorm" not in src, (
+        "Phase 4 arg emit_next_layernorm still present in "
+        "run_beta_coop_full. C1.5 must drop it."
+    )
+
+
+def test_dispatch_op_consumes_mlp_output_not_next_hidden_scratch():
+    """cute_phase_e_dispatch consume branch must read mlp_output.
+
+    Pre-C1.5 the consume branch read impl.next_hidden_scratch (the
+    Phase-4-baked next-layer input_LN output). Post-C1.5 it reads
+    impl.mlp_output (raw post-MLP hidden) and the next layer's
+    input_LN runs from Python.
+    """
+    from vllm.v1.attention.backends.cute_paged import _mlp_op
+    src = inspect.getsource(_mlp_op)
+    assert "next_hidden_scratch" not in src, (
+        "cute_phase_e_dispatch still references next_hidden_scratch. "
+        "C1.5 must update consume branch to read from mlp_output."
+    )
+    assert "impl.mlp_output[:nat]" in src, (
+        "Expected cute_phase_e_dispatch consume branch to read "
+        "impl.mlp_output[:nat] for hidden_out. C1.5 must keep this read "
+        "active — see _mlp_op.py consume branch."
+    )

vllm/nvllm/models/qwen3_5.py

Lines changed: 14 additions & 72 deletions
@@ -413,32 +413,16 @@ def _ct_mark(label: str) -> None:
             _ct_t = _now
 
         if residual is None:
-            # First-layer case: no residual to add. Phase F.1 skip-op only
-            # applies when there's a residual + we're past layer 0.
+            # First-layer case: no residual to add.
             residual = hidden_states
             hidden_states = self.input_layernorm(hidden_states)
             _ct_mark("input_ln_first")
         else:
-            # Phase F.1: use opaque skip op if MLP fusion is attached on
-            # THIS layer (attach-time constant, trace-safe). Op body reads
-            # impl._phase_e_skip_next_ln at runtime → passes through when
-            # the previous layer's β ε epilogue already ran input_layernorm.
-            _mlp_layer_name = getattr(self.mlp, "_cute_layer_name", None)
-            if _mlp_layer_name is not None:
-                out_x = torch.empty_like(hidden_states)
-                out_residual = torch.empty_like(residual)
-                _ct_mark("ln_skip_alloc")
-                torch.ops.vllm.cute_phase_e_skip_input_layernorm(
-                    hidden_states, residual, out_x, out_residual,
-                    _mlp_layer_name,
-                )
-                hidden_states, residual = out_x, out_residual
-                _ct_mark("ln_skip_op")
-            else:
-                hidden_states, residual = self.input_layernorm(
-                    hidden_states, residual
-                )
-                _ct_mark("input_ln")
+            # C1.5: Phase F.1 skip-op deleted. The previous layer's β-coop
+            # kernel ends at Phase 3 (no input_LN bake), so every layer
+            # entry runs input_layernorm unconditionally.
+            hidden_states, residual = self.input_layernorm(hidden_states, residual)
+            _ct_mark("input_ln")
 
         # Impl decides fusion per-forward. We mirror residual into impl's
         # persistent buffer unconditionally when fusion could run (full
@@ -624,56 +608,14 @@ def get_layer(prefix: str):
             config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers"
         )
 
-        # Phase F.1 cross-layer binding (always-on when MLP fusion is
-        # attached). Cheap module-ref attach; cute_phase_e_skip_input_layernorm
-        # opaque op needs this present even when β kernels are disabled,
-        # because the op call site fires whenever _cute_layer_name is set
-        # (gated by CUTE_MLP_FUSION, not CUTE_PHASE_E_FUSION). Without
-        # this attach, the op's non-skip branch raises fail-loud.
-        import os
-        layer_types = config.layer_types
-        num_layers = config.num_hidden_layers
-        for idx, layer in enumerate(self.layers):
-            if idx < self.start_layer or idx >= self.end_layer:
-                continue
-            if layer_types[idx] != "full_attention":
-                continue
-            attn = getattr(layer.self_attn, 'attn', None)
-            impl = getattr(attn, 'impl', None)
-            if impl is None or not hasattr(impl, 'attach_input_layernorm'):
-                continue
-            impl.attach_input_layernorm(
-                getattr(layer, 'input_layernorm', None)
-            )
-
-        # Phase E cross-layer binding (gated): every fusion-active
-        # (full_attention) decoder layer receives a ref to the NEXT decoder
-        # layer's input_layernorm module. Last layer (idx 63) passes None
-        # so the β kernel's ε epilogue omits the next-layer norm pull.
-        # ALSO allocates β kernel scratch buffers (heavy), so this stays
-        # gated by CUTE_PHASE_E_FUSION.
-        # Spec: docs/superpowers/specs/2026-04-22-unreal-kernel-phase-e-d25-design.md §5.3
-        if os.environ.get("CUTE_PHASE_E_FUSION", "0") == "1":
-            for idx, layer in enumerate(self.layers):
-                if idx < self.start_layer or idx >= self.end_layer:
-                    continue
-                if layer_types[idx] != "full_attention":
-                    continue
-                # impl lives on the inner Attention module, not on the
-                # Qwen3_5Attention wrapper: Qwen3_5Attention.attn is
-                # Attention, Attention.impl is CutePagedAttentionImpl.
-                # Existing pattern: see self_attn.attn.impl at L243, 361, 395.
-                attn = getattr(layer.self_attn, 'attn', None)
-                impl = getattr(attn, 'impl', None)
-                if impl is None or not hasattr(impl, 'attach_next_input_layernorm'):
-                    continue  # non-CuTe backend
-                # getattr tolerates PPMissingLayer (no input_layernorm attr)
-                next_norm = (
-                    getattr(self.layers[idx + 1], 'input_layernorm', None)
-                    if idx + 1 < num_layers
-                    else None
-                )
-                impl.attach_next_input_layernorm(next_norm)
+        # C1.5: Phase F.1 cross-layer binding loops (per-layer + next-layer
+        # LN bake) deleted. The skip-op they enabled (cute_phase_e_skip_*)
+        # was permanently retired in C1.5 along with β-coop's Phase 4
+        # epilogue — every layer now runs input_layernorm unconditionally
+        # at layer entry from Python (see Qwen3_5DecoderLayer.forward).
+        # The corresponding attach_* methods on CutePagedAttentionImpl
+        # are commented-out (not deleted) in _backend.py per the
+        # comment-out-kernel-code rule.
 
         self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory(
             ["hidden_states", "residual"], config.hidden_size
