diff --git a/benchmarks/nvllm/traces/cute_fusion/2026-04-17-own-the-stack/decode_log.txt b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-own-the-stack/decode_log.txt new file mode 100644 index 000000000000..e3690de1576f --- /dev/null +++ b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-own-the-stack/decode_log.txt @@ -0,0 +1,18 @@ +(EngineCore pid=144) INFO 04-17 17:23:02 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.3.self_attn.attn nat=1 phaseB ref: absmax=220.6929 mean=5.4770e-02 kernel: absmax=220.6929 mean=5.4770e-02 diff: max=0.0000 mean=2.0900e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:02 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.3.self_attn.attn nat=1 phaseC hidden_ref_absmax=0.0000 hidden_kernel_absmax=0.0000 h_max_diff=0.0000 res_ref_absmax=170141183460469231731687303715884105728.0000 res_kernel_absmax=170141183460469231731687303715884105728.0000 r_max_diff=224.0000 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:02 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.7.self_attn.attn nat=1 phaseB ref: absmax=50.6131 mean=1.1355e-02 kernel: absmax=50.6131 mean=1.1355e-02 diff: max=0.0000 mean=1.4925e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:02 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.7.self_attn.attn nat=1 phaseC hidden_ref_absmax=65.5958 hidden_kernel_absmax=65.5000 h_max_diff=0.0958 res_ref_absmax=50.6131 res_kernel_absmax=50.5000 r_max_diff=0.1131 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:02 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.11.self_attn.attn nat=1 phaseB ref: absmax=39.3754 mean=4.9943e-04 kernel: absmax=39.3754 mean=4.9943e-04 diff: max=0.0000 mean=2.1335e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:02 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.11.self_attn.attn nat=1 phaseC hidden_ref_absmax=53.2364 hidden_kernel_absmax=53.2500 h_max_diff=0.0136 res_ref_absmax=39.3754 res_kernel_absmax=39.5000 r_max_diff=0.1246 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.15.self_attn.attn nat=1 phaseB ref: absmax=14.0381 mean=2.9821e-03 kernel: absmax=14.0381 mean=2.9821e-03 diff: max=0.0000 mean=1.3829e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.15.self_attn.attn nat=1 phaseC hidden_ref_absmax=36.7743 hidden_kernel_absmax=36.7500 h_max_diff=0.0243 res_ref_absmax=14.0381 res_kernel_absmax=14.0625 r_max_diff=0.0244 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.19.self_attn.attn nat=1 phaseB ref: absmax=2.4686 mean=6.3049e-04 kernel: absmax=2.4686 mean=6.3050e-04 diff: max=0.0000 mean=1.5019e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.19.self_attn.attn nat=1 phaseC hidden_ref_absmax=6.6036 hidden_kernel_absmax=6.5938 h_max_diff=0.0098 res_ref_absmax=2.4686 res_kernel_absmax=2.4688 r_max_diff=0.0039 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.23.self_attn.attn nat=1 phaseB ref: absmax=8.1704 mean=1.4569e-03 kernel: absmax=8.1704 mean=1.4569e-03 diff: max=0.0000 mean=1.8064e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.23.self_attn.attn nat=1 phaseC hidden_ref_absmax=16.0783 
hidden_kernel_absmax=16.1250 h_max_diff=0.0467 res_ref_absmax=8.1704 res_kernel_absmax=8.1875 r_max_diff=0.0171 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.27.self_attn.attn nat=1 phaseB ref: absmax=3.0244 mean=2.7613e-04 kernel: absmax=3.0244 mean=2.7613e-04 diff: max=0.0000 mean=2.4245e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.27.self_attn.attn nat=1 phaseC hidden_ref_absmax=3.2705 hidden_kernel_absmax=3.2656 h_max_diff=0.0049 res_ref_absmax=3.0244 res_kernel_absmax=3.0312 r_max_diff=0.0068 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.31.self_attn.attn nat=1 phaseB ref: absmax=8.9649 mean=-4.0835e-03 kernel: absmax=8.9649 mean=-4.0835e-03 diff: max=0.0000 mean=2.2713e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.31.self_attn.attn nat=1 phaseC hidden_ref_absmax=12.0613 hidden_kernel_absmax=12.0625 h_max_diff=0.0037 res_ref_absmax=8.9649 res_kernel_absmax=8.9375 r_max_diff=0.0274 close_h=True close_r=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:578] [CUTE_DEBUG_FUSION] layer=model.layers.35.self_attn.attn nat=1 phaseB ref: absmax=4.5057 mean=7.0675e-04 kernel: absmax=4.5057 mean=7.0675e-04 diff: max=0.0000 mean=2.2241e-07 close=True +(EngineCore pid=144) INFO 04-17 17:23:03 [_backend.py:607] [CUTE_DEBUG_FUSION] layer=model.layers.35.self_attn.attn nat=1 phaseC hidden_ref_absmax=5.6348 hidden_kernel_absmax=5.6250 h_max_diff=0.0098 res_ref_absmax=4.5057 res_kernel_absmax=4.5000 r_max_diff=0.0057 close_h=True close_r=True diff --git a/benchmarks/nvllm/traces/cute_fusion/2026-04-17-own-the-stack/summary.md b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-own-the-stack/summary.md new file mode 100644 index 000000000000..c9543ed25d43 --- /dev/null +++ b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-own-the-stack/summary.md @@ -0,0 +1,60 @@ +# Own-the-stack Phase B Tier-3 evidence — 2026-04-17 + +**Refactor branch:** `feat/own-the-stack-phase-b` +**Model:** `natfii/Qwen3.5-27B-NVFP4-Opus-GB10` +**Image:** `nvllm:gb10-ots` +**Graph mode:** PIECEWISE CUDA graphs +**Baseline commit (fusion-ship):** `37cceaa6c` + +## GSM8K result + +**8/8 (100%) — matches baseline.** + +Two runs, both 8/8: +- Initial run without `CUTE_DEBUG_FUSION` — 8/8 +- Re-run with `CUTE_DEBUG_FUSION=1` for evidence capture — 8/8 + +## Fusion engagement + +`decode_log.txt` — first 3 full-attention layers × 2 decode steps × phase B/C: + +- **Phase B (W_O GEMV):** kernel `absmax` / `mean` vs Python-dequant reference → `close=True` on every line. +- **Phase C (residual + RMSNorm):** kernel `hidden` and `residual` outputs vs reference → `close_h=True close_r=True` on every line. + +Across the entire GSM8K run, 1920 decode steps engaged fusion; zero lines with `close=False` or `close_h=False` or `close_r=False`. + +Startup logged the new API firing for all 16 full-attention layers: + +``` +INFO [_backend.py:302] CuTe fusion attached: layer=model.layers.3 max_num_seqs=4 hidden_dim=5120 q_size=6144 attn_output_gate=True +(... 16 such lines, layer indices 3 7 11 ... 63 ...) +INFO [_backend.py:355] CuTe fusion resolved: layer=model.layers.3 wo_weight=[...] rmsnorm_gamma=[...] +(... 16 such lines ...) 
+``` + +Confirms `CutePagedAttentionImpl.attach_fusion(parent_layer)` + `_resolve_fusion_weights()` replaced the old `_fusion_bind_callback` / `bind_fusion_weights` pair with no behavioral change. + +## Tier-1 jupyter tests (host-side) + +All 5 pass: `notebooks/nvllm/fusion_bind_tests.ipynb`. + +1. NVFP4 happy-path +2. BF16 skip-path (CLAUDE.md debug protocol step 2 regression gate) +3. Double-resolve rebinds to fresh tensor identity (C1 + C2) +4. Buffer pointer stability across attach calls (H3) +5. Per-forward gate boundary `num_actual_tokens > max_num_seqs` (A3) + +## How to reproduce + +```bash +cd /home/natfii/docker/nvllm +git checkout feat/own-the-stack-phase-b +docker build -f docker/Dockerfile.gb10 -t nvllm:gb10-ots . +NVLLM_IMAGE=nvllm:gb10-ots CUTE_DEBUG_FUSION=1 ./scripts/serve-cute.sh +.venv/bin/python scripts/gsm8k_sanity.py +``` + +## Rollback + +Single-commit refactor. `git revert HEAD` returns to `37cceaa6c`. +Docker image snapshot `nvllm:gb10-preshim-20260417` tags the pre-refactor build. diff --git a/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles/fused.pt.trace.json.gz b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles/fused.pt.trace.json.gz new file mode 100644 index 000000000000..b086a2cb591a Binary files /dev/null and b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles/fused.pt.trace.json.gz differ diff --git a/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles/profiler_out_0.txt b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles/profiler_out_0.txt new file mode 100644 index 000000000000..b0079e9152de --- /dev/null +++ b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles/profiler_out_0.txt @@ -0,0 +1,107 @@ +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ + execute_context_0(0)_generation_4(4) 0.00% 0.000us 0.00% 0.000us 0.000us 10.561s 101.00% 10.561s 83.821ms 126 +_ZN7cutlass13device_kernelINS_4gemm6kernel13GemmUniv... 0.00% 0.000us 0.00% 0.000us 0.000us 7.939s 75.92% 7.939s 205.625us 38608 + aten::mm 0.11% 11.736ms 0.12% 12.885ms 101.460us 1.263s 12.07% 1.263s 9.942ms 127 +void cutlass::Kernel2(int... 0.00% 0.000us 0.00% 0.000us 0.000us 64.845ms 0.62% 64.845ms 2.127us 30480 + cudaGraphLaunch 3.95% 421.166ms 4.04% 430.778ms 52.184us 37.815ms 0.36% 37.816ms 4.581us 8255 + cudaStreamIsCapturing 0.02% 2.654ms 0.02% 2.654ms 0.264us 37.738ms 0.36% 37.738ms 3.760us 10037 + _causal_conv1d_update_kernel 0.00% 0.000us 0.00% 0.000us 0.000us 21.310ms 0.20% 21.310ms 3.496us 6096 +void vllm::silu_mul_cvt_fp16_to_fp4<__nv_bfloat16, f... 0.00% 0.000us 0.00% 0.000us 0.000us 13.059ms 0.12% 13.059ms 1.607us 8128 +triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_sca... 0.00% 0.000us 0.00% 0.000us 0.000us 10.933ms 0.10% 10.933ms 1.793us 6096 +triton_red_fused__to_copy_add_copy__mean_mul_pow_rsq... 0.00% 0.000us 0.00% 0.000us 0.000us 8.987ms 0.09% 8.987ms 2.211us 4064 +triton_poi_fused__to_copy__unsafe_view_add_clone_mea... 
0.00% 0.000us 0.00% 0.000us 0.000us 7.504ms 0.07% 7.504ms 1.231us 6096 + triton_poi_fused_0 0.00% 0.000us 0.00% 0.000us 0.000us 7.166ms 0.07% 7.166ms 0.868us 8255 + Memset (Device) 0.00% 0.000us 0.00% 0.000us 0.000us 6.398ms 0.06% 6.398ms 0.394us 16255 + triton_poi_fused_4 0.00% 0.000us 0.00% 0.000us 0.000us 6.241ms 0.06% 6.241ms 1.024us 6096 + triton_per_fused_1 0.00% 0.000us 0.00% 0.000us 0.000us 5.460ms 0.05% 5.460ms 0.896us 6096 + aten::copy_ 0.19% 20.221ms 0.54% 57.720ms 12.622us 5.307ms 0.05% 5.307ms 1.161us 4573 + _C_cache_ops::reshape_and_cache_flash 0.08% 8.675ms 0.18% 19.099ms 9.399us 5.212ms 0.05% 5.212ms 2.565us 2032 +void vllm::reshape_and_cache_flash_kernel<__nv_bfloa... 0.00% 0.000us 0.00% 0.000us 0.000us 5.212ms 0.05% 5.212ms 2.565us 2032 +triton_red_fused__to_copy_add_copy__mean_mul_pow_rsq... 0.00% 0.000us 0.00% 0.000us 0.000us 5.011ms 0.05% 5.011ms 2.466us 2032 +triton_red_fused__to_copy_add_copy__mean_mul_pow_rsq... 0.00% 0.000us 0.00% 0.000us 0.000us 4.978ms 0.05% 4.978ms 2.450us 2032 + triton_poi_fused_6 0.00% 0.000us 0.00% 0.000us 0.000us 4.688ms 0.04% 4.688ms 0.769us 6096 +triton_poi_fused__to_copy_add_cat_clone_index_select... 0.00% 0.000us 0.00% 0.000us 0.000us 4.044ms 0.04% 4.044ms 1.990us 2032 +triton_red_fused__to_copy_add_mean_mul_pow_rsqrt_sca... 0.00% 0.000us 0.00% 0.000us 0.000us 3.978ms 0.04% 3.978ms 1.958us 2032 + _compute_slot_mapping_kernel 0.00% 0.000us 0.00% 0.000us 0.000us 3.691ms 0.04% 3.691ms 7.265us 508 +triton_poi_fused__to_copy_add_cat_index_select_mean_... 0.00% 0.000us 0.00% 0.000us 0.000us 3.511ms 0.03% 3.511ms 1.728us 2032 + triton_poi_fused_zeros_7 0.00% 0.000us 0.00% 0.000us 0.000us 2.873ms 0.03% 2.873ms 0.707us 4064 +triton_poi_fused__to_copy_add_cat_mean_mul_pow_rsqrt... 0.00% 0.000us 0.00% 0.000us 0.000us 2.617ms 0.03% 2.617ms 1.288us 2032 +triton_poi_fused__to_copy_add_cat_clone_mean_mul_pow... 0.00% 0.000us 0.00% 0.000us 0.000us 2.324ms 0.02% 2.324ms 1.144us 2032 + triton_poi_fused_3 0.00% 0.000us 0.00% 0.000us 0.000us 2.083ms 0.02% 2.083ms 1.025us 2032 + Memcpy DtoD (Device -> Device) 0.00% 0.000us 0.00% 0.000us 0.000us 2.079ms 0.02% 2.079ms 0.862us 2412 + triton_red_fused_7 0.00% 0.000us 0.00% 0.000us 0.000us 2.053ms 0.02% 2.053ms 1.010us 2032 +void at::native::unrolled_elementwise_kernel Device) 0.00% 0.000us 0.00% 0.000us 0.000us 1.046ms 0.01% 1.046ms 0.822us 1272 + aten::argmax 0.02% 2.274ms 0.05% 5.450ms 42.911us 1.025ms 0.01% 1.025ms 8.067us 127 +void at::native::reduce_kernel<512, 1, at::native::R... 0.00% 0.000us 0.00% 0.000us 0.000us 974.773us 0.01% 974.773us 7.675us 127 + Buffer Flush 0.02% 2.016ms 0.02% 2.033ms 156.377us 962.422us 0.01% 962.422us 74.032us 13 + aten::sub 0.05% 5.328ms 0.10% 10.489ms 10.324us 939.062us 0.01% 939.062us 0.924us 1016 + triton_red_fused_2 0.00% 0.000us 0.00% 0.000us 0.000us 731.499us 0.01% 731.499us 5.760us 127 + Activity Buffer Request 0.20% 21.094ms 0.20% 21.140ms 1.409ms 721.289us 0.01% 721.289us 48.086us 15 + cudaEventRecord 0.00% 529.852us 0.00% 529.852us 2.070us 680.236us 0.01% 680.236us 2.657us 256 + cudaStreamWaitEvent 0.00% 313.759us 0.00% 313.759us 2.471us 616.016us 0.01% 616.016us 4.851us 127 +void at::native::vectorized_elementwise_kernel<4, at... 0.00% 0.000us 0.00% 0.000us 0.000us 609.384us 0.01% 609.384us 1.200us 508 + aten::index 0.08% 8.435ms 0.14% 15.196ms 59.591us 502.135us 0.00% 767.139us 3.008us 255 +void at::native::unrolled_elementwise_kernel(... 
0.00% 0.000us 0.00% 0.000us 0.000us 177.139us 0.00% 177.139us 1.395us 127 +void at::native::unrolled_elementwise_kernel Pinned) 0.00% 0.000us 0.00% 0.000us 0.000us 76.631us 0.00% 76.631us 0.603us 127 + Memset (Unknown) 0.00% 0.000us 0.00% 0.000us 0.000us 41.084us 0.00% 41.084us 0.321us 128 + Lazy Function Loading 0.00% 55.212us 0.00% 55.212us 55.212us 33.885us 0.00% 33.885us 33.885us 1 + aten::scatter_ 0.01% 1.543ms 0.02% 1.620ms 1.620ms 2.016us 0.00% 2.016us 2.016us 1 +void at::native::_scatter_gather_elementwise_kernel<... 0.00% 0.000us 0.00% 0.000us 0.000us 2.016us 0.00% 2.016us 2.016us 1 +void at::native::vectorized_elementwise_kernel<2, at... 0.00% 0.000us 0.00% 0.000us 0.000us 0.800us 0.00% 0.800us 0.800us 1 + execute_context_0(0)_generation_4(4) 4.51% 481.258ms 21.66% 2.310s 18.335ms 0.000us 0.00% 2.380s 18.888ms 126 + aten::slice 0.54% 57.437ms 0.71% 76.173ms 1.550us 0.000us 0.00% 0.000us 0.000us 49155 + aten::as_strided 0.26% 27.670ms 0.26% 27.670ms 0.351us 0.000us 0.00% 0.000us 0.000us 78876 + cudaMemcpyAsync 0.30% 31.957ms 0.30% 31.957ms 8.385us 0.000us 0.00% 0.000us 0.000us 3811 + aten::lift_fresh 0.00% 137.227us 0.00% 137.227us 0.536us 0.000us 0.00% 0.000us 0.000us 256 + aten::flatten 0.01% 873.511us 0.01% 1.344ms 10.580us 0.000us 0.00% 0.000us 0.000us 127 + aten::view 0.07% 7.503ms 0.07% 7.503ms 0.703us 0.000us 0.00% 0.000us 0.000us 10669 + aten::index_select 0.01% 1.179ms 0.01% 1.179ms 9.280us 0.000us 0.00% 0.000us 0.000us 127 + aten::detach 0.01% 873.870us 0.01% 873.870us 3.440us 0.000us 0.00% 0.000us 0.000us 254 + aten::to 0.03% 3.427ms 0.20% 21.353ms 1.586us 0.000us 0.00% 2.184ms 0.162us 13466 + aten::resolve_conj 0.00% 88.940us 0.00% 88.940us 0.350us 0.000us 0.00% 0.000us 0.000us 254 + aten::resolve_neg 0.00% 32.045us 0.00% 32.045us 0.126us 0.000us 0.00% 0.000us 0.000us 254 + cudaLaunchKernel 0.33% 34.911ms 0.33% 34.911ms 8.324us 0.000us 0.00% 0.000us 0.000us 4194 + aten::reshape 0.00% 319.108us 0.01% 785.756us 3.081us 0.000us 0.00% 0.000us 0.000us 255 + aten::_to_copy 0.02% 2.594ms 0.17% 17.925ms 20.118us 0.000us 0.00% 2.184ms 2.451us 891 + aten::empty_strided 0.09% 9.398ms 0.09% 10.068ms 3.444us 0.000us 0.00% 0.000us 0.000us 2923 + aten::select 0.08% 8.643ms 0.10% 10.181ms 2.227us 0.000us 0.00% 0.000us 0.000us 4572 + aten::lt 0.01% 1.047ms 0.01% 1.047ms 8.247us 0.000us 0.00% 0.000us 0.000us 127 + aten::eq 0.01% 1.149ms 0.03% 2.777ms 21.868us 0.000us 0.00% 0.000us 0.000us 127 + aten::sum 0.02% 1.970ms 0.03% 2.845ms 22.403us 0.000us 0.00% 0.000us 0.000us 127 + aten::item 0.00% 432.115us 0.01% 732.264us 5.766us 0.000us 0.00% 0.000us 0.000us 127 + aten::_local_scalar_dense 0.00% 300.149us 0.00% 300.149us 2.363us 0.000us 0.00% 0.000us 0.000us 127 + Pregraph bytecode 0.70% 74.798ms 0.70% 74.798ms 588.961us 0.000us 0.00% 0.000us 0.000us 127 + aten::transpose 0.14% 15.292ms 0.18% 18.920ms 3.040us 0.000us 0.00% 0.000us 0.000us 6223 + aten::unsqueeze 0.10% 11.048ms 0.12% 13.311ms 1.080us 0.000us 0.00% 0.000us 0.000us 12319 + aten::squeeze 0.05% 5.677ms 0.06% 6.668ms 1.094us 0.000us 0.00% 0.000us 0.000us 6096 + vllm::unified_kv_cache_update 0.35% 37.728ms 0.66% 69.972ms 34.435us 0.000us 0.00% 5.212ms 2.565us 2032 + aten::empty 0.04% 4.200ms 0.04% 4.296ms 2.112us 0.000us 0.00% 0.000us 0.000us 2034 +------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ +Self CPU time total: 10.664s +Self CUDA time total: 10.457s + diff --git 
a/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/summary.md b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/summary.md new file mode 100644 index 000000000000..11caa5b2266a --- /dev/null +++ b/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/summary.md @@ -0,0 +1,111 @@ +# Profile: cute_fusion — Phase B/C fusion under PIECEWISE CUDA graphs + +**Commit:** `37cceaa6c` — [fix(cute-paged): fix fusion Phase C gibberish via per-CTA arrival counter](https://github.com/Navi-AI-Lab/nvllm/commit/37cceaa6c199bf211a2e170c414f64bf654b0f45) +**Date:** 2026-04-17 +**Model:** `natfii/Qwen3.5-27B-NVFP4-Opus-GB10` (27B dense, 48 layers, 12 `full_attention` / 36 `linear_attention`) +**Hardware:** NVIDIA DGX Spark — GB10, SM121 +**Config:** +- `--attention-backend CUTE_PAGED` +- `--kv-cache-dtype fp8_e4m3` +- `--compilation-config '{"cudagraph_mode":"PIECEWISE"}'` (no `--enforce-eager`) +- `--max-num-seqs 4`, `--max-model-len 65536` +- Fusion: **ON** — Phase A (attention + gate sigmoid) + Phase B (W_O NVFP4 GEMV) + Phase C (residual add + RMSNorm) all in one CuTe uber-kernel + +**Profiler:** vLLM built-in torch profiler (`--profiler-config profiler=torch, ignore_frontend=true, delay_iterations=3, active_iterations=30`) + +**Workload:** 4 concurrent `/v1/completions` requests × 128 tokens each, `ignore_eos=true`, temperature=0. Steady-state decode after a 2-request warmup. + +## Top 15 kernels by total GPU time (fused path) + +| # | Kernel | Calls | Total | Mean | % GPU | +|---|---|---|---|---|---| +| 1 | `cutlass::device_kernel` (NVFP4 CUTLASS) | 38,608 | 7.939 s | 205.625 μs | **75.92%** | +| 2 | `aten::mm` / `cutlass_80_wmma_tensorop_bf16` (BF16 embed + lm_head) | 127 | 1.263 s | 9.942 ms | 12.07% | +| 3 | **`kernel_cutlass__kernel_vllmv1attentionbackendscute_paged`** (CuTe uber-kernel — A+B+C fused) | **2,032** | **865.392 ms** | **425.882 μs** | **8.28%** | +| 4 | `vllm::gdn_attention_core` (mamba linear attention) | 6,096 | 199.608 ms | 32.776 μs | 1.91% | +| 5 | `fused_recurrent_gated_delta_rule_packed_decode_kernel` | 6,096 | 178.298 ms | 29.248 μs | 1.71% | +| 6 | `vllm::cvt_fp16_to_fp4` (activation quant) | 30,480 | 64.845 ms | 2.127 μs | 0.62% | +| 7 | `cudaGraphLaunch` | 8,255 | 37.815 ms | 4.581 μs | 0.36% | +| 8 | `cudaStreamIsCapturing` | 10,037 | 37.738 ms | 3.760 μs | 0.36% | +| 9 | `_causal_conv1d_update_kernel` | 6,096 | 21.310 ms | 3.496 μs | 0.20% | +| 10 | `vllm::silu_mul_cvt_fp16_to_fp4` (SiLU+mul+quant) | 8,128 | 13.059 ms | 1.607 μs | 0.12% | +| 11 | `triton_red_fused__to_copy_add_mean_mul_pow_rsqrt` (RMSNorm) | 6,096 | 10.933 ms | 1.793 μs | 0.10% | +| 12 | `triton_red_fused__to_copy_add_copy__mean_mul_pow_rsqrt` (RMSNorm) | 4,064 | 8.987 ms | 2.211 μs | 0.09% | +| 13 | `triton_poi_fused__to_copy__unsafe_view_add_clone_mean` | 6,096 | 7.504 ms | 1.231 μs | 0.07% | +| 14 | `triton_poi_fused_0` | 8,255 | 7.166 ms | 0.868 μs | 0.07% | +| 15 | `Memset (Device)` | 16,255 | 6.398 ms | 0.394 μs | 0.06% | + +## Read + +- **NVFP4 CUTLASS GEMM (76%)** dominates the decode hot path — expected on a dense 27B in FP4. This is the target of all future perf work (stream-K tuning, persistent kernel, better tile schedulers). The CuTe fusion kernel is no longer the bottleneck; the FFN is. +- **CuTe fused A+B+C kernel (8.28% of GPU time)** runs at 425.9 μs/call. 
This call now subsumes three previously separate ops: + - Phase A — attention (previously ~244 μs standalone, per the April 13 CuTe baseline memory) + - Phase B — W_O NVFP4 GEMV (previously a separate `cutlass::device_kernel` call in top-1) + - Phase C — residual add + post-attn RMSNorm (previously two Triton kernels) +- **Triton RMSNorm kernels (items 11, 12, 13)** are still present for the other RMSNorms in the layer (pre-attn norm, pre-MoE norm, post-MoE norm). Only the post-attention RMSNorm is fused into CuTe. +- Attention + its fused epilogue now occupies **8.28%** of decode GPU time, up from ~0.5% unfused — because the fused kernel absorbed work that used to show up as 2-3 separate kernels. Net effect: same total work, fewer kernel launches, fewer materializations of the attention output tensor. + +## CUDA graph / sync overhead (items 7–8) + +`cudaGraphLaunch` + `cudaStreamIsCapturing` = **75.6 ms / 0.72% of GPU time** across **18,292 host-side calls**. This is the PIECEWISE capture/replay bookkeeping — ~9 μs per decode step to re-enter + replay a captured subgraph. Small absolute cost but high call count; if we ever migrate to FULL or FULL_AND_PIECEWISE graphs, these collapse into one launch per step. Not an action item right now — just a known overhead signature worth tracking. + +## Throughput (measured during profile capture) + +| Scenario | Wall | Tokens | tok/s aggregate | +|---|---|---|---| +| batch=4 × 128 tok (profiled) | 12.47 s | 512 | 41.1 | +| batch=4 × 256 tok (pre-profile, GSM8K session) | 23.48 s | 1024 | 43.6 | +| batch=1 × 256 tok (pre-profile) | 22.91 s | 256 | 11.2 | + +Profiler overhead during capture: ~5% (43.6 → 41.1 tok/s). + +## Caveats + +- This profile captures **fusion-on**. A direct unfused baseline is not in this trace; the next step is a second capture with `Qwen3NextDecoderLayer._fusion_bound` forced to False (or by launching with `fusion_active` gated off) to quantify the exact fusion delta. +- `execute_context_0(0)_generation_4(4)` at 10.56 s (101%) is the NVTX range wrapping all 4 generation requests — double-counted under torch-profiler accounting, not a real kernel. +- `ignore_eos=true` means the decode runs the full 128 tokens with no early-exit; closer to benchmark conditions than natural chat. + +## How to reproduce + +```bash +# 1. Build image with fusion fix baked in +docker build -f docker/Dockerfile.gb10 -t nvllm:gb10 . + +# 2. 
Launch with torch profiler +mkdir -p benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles +docker run -d --name nvllm --gpus all --ipc=host --network host --privileged \ + -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \ + -v "$HOME/.cache/flashinfer:/root/.cache/flashinfer" \ + -v "$PWD/benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles:/tmp/profiles" \ + -e VLLM_NVFP4_GEMM_BACKEND=cutlass \ + -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ + -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \ + nvllm:gb10 serve \ + --model natfii/Qwen3.5-27B-NVFP4-Opus-GB10 --served-model-name default \ + --host 0.0.0.0 --port 8000 --kv-cache-dtype fp8_e4m3 \ + --attention-backend CUTE_PAGED --max-model-len 65536 --max-num-seqs 4 \ + --language-model-only --mamba-cache-mode align --trust-remote-code \ + --gpu-memory-utilization 0.80 --max-num-batched-tokens 65536 \ + --compilation-config '{"cudagraph_mode":"PIECEWISE"}' \ + --profiler-config '{"profiler":"torch","torch_profiler_dir":"/tmp/profiles","ignore_frontend":true,"delay_iterations":3,"active_iterations":30,"torch_profiler_with_stack":false,"torch_profiler_use_gzip":true}' + +# 3. Wait for "Application startup complete", warm up 2 requests, then: +curl -X POST http://localhost:8000/start_profile + +# 4. Fire 4 concurrent decode requests (128 tokens each, ignore_eos) +for i in 1 2 3 4; do + curl -s http://localhost:8000/v1/completions \ + -H 'Content-Type: application/json' \ + -d "{\"model\":\"default\",\"prompt\":\"<128-token prompt>\",\"max_tokens\":128,\"temperature\":0,\"ignore_eos\":true}" & +done; wait + +curl -X POST http://localhost:8000/stop_profile + +# 5. Trace lands in ./benchmarks/nvllm/traces/cute_fusion/2026-04-17-phase-bc-fused/profiles/ +# View with chrome://tracing or https://ui.perfetto.dev +``` + +## Files + +- `profiles/fused.pt.trace.json.gz` — 11.5 MB torch profiler trace (Chrome Tracing / Perfetto compatible) +- `profiles/profiler_out_0.txt` — 21 KB human-readable kernel summary diff --git a/cmake/external_projects/vllm_flash_attn.cmake b/cmake/external_projects/vllm_flash_attn.cmake index 9414f5af3722..fb66715ce412 100644 --- a/cmake/external_projects/vllm_flash_attn.cmake +++ b/cmake/external_projects/vllm_flash_attn.cmake @@ -39,7 +39,7 @@ else() FetchContent_Declare( vllm-flash-attn GIT_REPOSITORY https://github.com/vllm-project/flash-attention.git - GIT_TAG c0ec424fd8a546d0cbbf4bf050bbcfe837c55afb + GIT_TAG f5bc33cfc02c744d24a2e9d50e6db656de40611c GIT_PROGRESS TRUE # Don't share the vllm-flash-attn build between build types BINARY_DIR ${CMAKE_BINARY_DIR}/vllm-flash-attn diff --git a/docs/design/attention_backends.md b/docs/design/attention_backends.md index 242cc6b3b1ed..a9aa7cd45e74 100644 --- a/docs/design/attention_backends.md +++ b/docs/design/attention_backends.md @@ -107,15 +107,6 @@ Priority is **1 = highest** (tried first). | 3 | `TRITON_ATTN` | | 4 | `FLEX_ATTENTION` | -**Ampere/Hopper (SM 8.x-9.x):** - -| Priority | Backend | -| -------- | ------- | -| 1 | `FLASH_ATTN` | -| 2 | `FLASHINFER` | -| 3 | `TRITON_ATTN` | -| 4 | `FLEX_ATTENTION` | - ### MLA Attention (DeepSeek-style) **Blackwell (SM 10.x):** @@ -177,7 +168,7 @@ Priority is **1 = highest** (tried first). 
| `ROCM_AITER_UNIFIED_ATTN` | | fp16, bf16 | `auto` | %16 | Any | ✅ | ✅ | ❌ | All | N/A | | `ROCM_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2` | %16 | 32, 64, 80, 96, 128, 160, 192, 224, 256 | ❌ | ✅ | ❌ | Decoder, Encoder, Encoder Only | N/A | | `TREE_ATTN` | | fp16, bf16 | `auto`, `float16`, `bfloat16` | %16 | 32, 64, 96, 128, 160, 192, 224, 256 | ❌ | ❌ | ❌ | Decoder | Any | -| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2`, `int8_per_token_head`, `fp8_per_token_head` | %16 | Any | ✅ | ✅ | ❌ | All | Any | +| `TRITON_ATTN` | | fp16, bf16, fp32 | `auto`, `float16`, `bfloat16`, `fp8`, `fp8_e4m3`, `fp8_e5m2`, `int8_per_token_head`, `fp8_per_token_head`, `turboquant25`, `turboquant35` | %16 | Any | ✅ | ✅ | ❌ | All | Any | > **†** FlashInfer uses TRTLLM attention on Blackwell (SM100), which supports sinks. Disable via `--attention-config.use_trtllm_attention=0`. > diff --git a/notebooks/nvllm/fusion_bind_tests.ipynb b/notebooks/nvllm/fusion_bind_tests.ipynb new file mode 100644 index 000000000000..04fd1316d13b --- /dev/null +++ b/notebooks/nvllm/fusion_bind_tests.ipynb @@ -0,0 +1,335 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "7fb27b941602401d91542211134fc71a", + "metadata": {}, + "source": [ + "# Fusion-bind Tier-1 tests\n", + "\n", + "Five cases from `docs/superpowers/specs/2026-04-17-own-the-stack-design.md` Tier 1.\n", + "Run order matters - the helpers cell defines `fake_impl`,\n", + "`nvfp4_o_proj`, etc. used by all tests." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "acae54e37e7d407bbb7b55eff062a284", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "\n", + "sys.path.insert(0, \"/home/natfii/docker/nvllm\")\n", + "from unittest.mock import MagicMock, patch\n", + "\n", + "import torch\n", + "\n", + "from vllm.v1.attention.backends.cute_paged._backend import (\n", + " CutePagedAttentionImpl,\n", + ")\n", + "\n", + "\n", + "def fake_impl():\n", + " impl = CutePagedAttentionImpl.__new__(CutePagedAttentionImpl)\n", + " impl.num_heads = 24\n", + " impl.head_size = 256\n", + " impl.num_kv_heads = 4\n", + " impl.scale = 1.0 / (256**0.5)\n", + " impl.kv_cache_dtype = \"auto\"\n", + " impl.num_queries_per_kv = 6\n", + " impl.kv_sharing_target_layer_name = None\n", + " impl.alibi_slopes = None\n", + " impl.sliding_window = None\n", + " impl.logits_soft_cap = None\n", + " impl._fusion_bound = False\n", + " impl._fusion_active = False\n", + " impl._fusion_attached = False\n", + " return impl\n", + "\n", + "\n", + "def nvfp4_o_proj(hidden_dim=5120, q_size=6144, device=\"cpu\"):\n", + " mod = MagicMock()\n", + " # NVFP4 packed weight is uint8 \u2014 not differentiable, so use plain tensor.\n", + " mod.weight = torch.zeros(hidden_dim, q_size // 2, dtype=torch.uint8, device=device)\n", + " # FP8 block scales likewise cannot be nn.Parameter (no grad support for fp8).\n", + " mod.weight_scale = torch.zeros(\n", + " hidden_dim, q_size // 16, dtype=torch.float8_e4m3fn, device=device\n", + " )\n", + " # Global scale IS fp32 and IS a Parameter in the real quant module.\n", + " mod.weight_global_scale = torch.nn.Parameter(\n", + " torch.tensor([1.0], dtype=torch.float32, device=device)\n", + " )\n", + " return mod\n", + "\n", + "\n", + "def bf16_o_proj(hidden_dim=5120, q_size=6144, device=\"cpu\"):\n", + " mod = MagicMock(spec=[\"weight\"])\n", + " mod.weight = torch.nn.Parameter(\n", + " torch.zeros(hidden_dim, q_size, dtype=torch.bfloat16, 
device=device)\n", + " )\n", + " return mod\n", + "\n", + "\n", + "def post_norm(hidden_dim=5120, device=\"cpu\"):\n", + " mod = MagicMock()\n", + " mod.weight = torch.nn.Parameter(\n", + " torch.ones(hidden_dim, dtype=torch.bfloat16, device=device)\n", + " )\n", + " mod.variance_epsilon = 1e-6\n", + " return mod\n", + "\n", + "\n", + "def parent_layer(\n", + " o_proj,\n", + " post_norm_mod,\n", + " prefix=\"model.layers.0\",\n", + " attn_output_gate=True,\n", + " hidden_size=5120,\n", + " num_heads=24,\n", + " head_dim=256,\n", + "):\n", + " layer = MagicMock()\n", + " layer.prefix = prefix\n", + " layer.self_attn = MagicMock()\n", + " layer.self_attn.o_proj = o_proj\n", + " layer.self_attn.num_heads = num_heads\n", + " layer.self_attn.head_dim = head_dim\n", + " layer.self_attn.hidden_size = hidden_size\n", + " layer.self_attn.attn_output_gate = attn_output_gate\n", + " layer.post_attention_layernorm = post_norm_mod\n", + " return layer\n", + "\n", + "\n", + "def mock_cfg(max_num_seqs=16):\n", + " cfg = MagicMock()\n", + " cfg.scheduler_config.max_num_seqs = max_num_seqs\n", + " return cfg\n", + "\n", + "\n", + "# Buffer allocation in attach_fusion uses 'cuda' device by default. Patch to\n", + "# 'cpu' so these tests run on host without a GPU context.\n", + "_orig_prealloc = CutePagedAttentionImpl._preallocate_fusion_buffers\n", + "\n", + "\n", + "def _cpu_prealloc(self, max_num_seqs, hidden_dim, q_size, device):\n", + " return _orig_prealloc(self, max_num_seqs, hidden_dim, q_size, \"cpu\")\n", + "\n", + "\n", + "print(\"test helpers ready\")" + ] + }, + { + "cell_type": "markdown", + "id": "9a63283cbaf04dbcab1f6479b197f3a8", + "metadata": {}, + "source": [ + "## Test 1 \u2014 NVFP4 happy-path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8dd0d8092fe74a7c96281538738b07e2", + "metadata": {}, + "outputs": [], + "source": [ + "impl = fake_impl()\n", + "o = nvfp4_o_proj()\n", + "n = post_norm()\n", + "p = parent_layer(o, n)\n", + "\n", + "with (\n", + " patch(\"vllm.config.get_current_vllm_config\", return_value=mock_cfg(16)),\n", + " patch.object(CutePagedAttentionImpl, \"_preallocate_fusion_buffers\", _cpu_prealloc),\n", + "):\n", + " impl.attach_fusion(p)\n", + "\n", + "assert impl._fusion_attached is True\n", + "assert impl._fusion_max_num_seqs == 16\n", + "assert impl._fusion_hidden_dim == 5120\n", + "assert impl._fusion_q_size == 24 * 256\n", + "\n", + "impl._resolve_fusion_weights()\n", + "assert impl._fusion_bound is True\n", + "assert impl.wo_weight is o.weight\n", + "assert impl.wo_global_scale is o.weight_global_scale\n", + "assert impl.rmsnorm_gamma is n.weight\n", + "assert impl.rmsnorm_eps == 1e-6\n", + "print(\"Test 1 PASS\")" + ] + }, + { + "cell_type": "markdown", + "id": "72eea5119410473aa328ad9291626812", + "metadata": {}, + "source": [ + "## Test 2 \u2014 BF16 skip-path" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8edb47106e1a46a883d545849b8ab81b", + "metadata": {}, + "outputs": [], + "source": [ + "impl = fake_impl()\n", + "o = bf16_o_proj()\n", + "n = post_norm()\n", + "p = parent_layer(o, n)\n", + "\n", + "with (\n", + " patch(\"vllm.config.get_current_vllm_config\", return_value=mock_cfg(16)),\n", + " patch.object(CutePagedAttentionImpl, \"_preallocate_fusion_buffers\", _cpu_prealloc),\n", + "):\n", + " impl.attach_fusion(p)\n", + "\n", + "impl._resolve_fusion_weights() # must not raise AttributeError\n", + "assert impl._fusion_bound is False\n", + "assert impl._fusion_attached is True\n", + "print(\"Test 2 
PASS\")" + ] + }, + { + "cell_type": "markdown", + "id": "10185d26023b46108eb7d9f57d49d2b3", + "metadata": {}, + "source": [ + "## Test 3 \u2014 Double-resolve rebinds to fresh tensor identity" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8763a12b2bbd4a93a75aff182afb95dc", + "metadata": {}, + "outputs": [], + "source": [ + "impl = fake_impl()\n", + "o = nvfp4_o_proj()\n", + "n = post_norm()\n", + "p = parent_layer(o, n)\n", + "\n", + "with (\n", + " patch(\"vllm.config.get_current_vllm_config\", return_value=mock_cfg(16)),\n", + " patch.object(CutePagedAttentionImpl, \"_preallocate_fusion_buffers\", _cpu_prealloc),\n", + "):\n", + " impl.attach_fusion(p)\n", + "\n", + "impl._resolve_fusion_weights()\n", + "old_gs = impl.wo_global_scale\n", + "\n", + "o.weight_global_scale = torch.nn.Parameter(torch.tensor([2.0], dtype=torch.float32))\n", + "\n", + "impl._resolve_fusion_weights()\n", + "assert impl.wo_global_scale is o.weight_global_scale\n", + "assert impl.wo_global_scale is not old_gs\n", + "assert impl.wo_global_scale.item() == 2.0\n", + "print(\"Test 3 PASS\")" + ] + }, + { + "cell_type": "markdown", + "id": "7623eae2785240b9bd12b16a66d81610", + "metadata": {}, + "source": [ + "## Test 4 \u2014 Buffer pointer stability across attach calls" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "7cdc8c89c7104fffa095e18ddfef8986", + "metadata": {}, + "outputs": [], + "source": [ + "impl = fake_impl()\n", + "o = nvfp4_o_proj()\n", + "n = post_norm()\n", + "p = parent_layer(o, n)\n", + "\n", + "with (\n", + " patch(\"vllm.config.get_current_vllm_config\", return_value=mock_cfg(16)),\n", + " patch.object(CutePagedAttentionImpl, \"_preallocate_fusion_buffers\", _cpu_prealloc),\n", + "):\n", + " impl.attach_fusion(p)\n", + "\n", + "ptrs_before = (\n", + " impl.wo_output.data_ptr(),\n", + " impl.rmsnorm_output.data_ptr(),\n", + " impl.gate_buf.data_ptr(),\n", + ")\n", + "\n", + "with (\n", + " patch(\"vllm.config.get_current_vllm_config\", return_value=mock_cfg(16)),\n", + " patch.object(CutePagedAttentionImpl, \"_preallocate_fusion_buffers\", _cpu_prealloc),\n", + "):\n", + " impl.attach_fusion(p)\n", + "\n", + "ptrs_after = (\n", + " impl.wo_output.data_ptr(),\n", + " impl.rmsnorm_output.data_ptr(),\n", + " impl.gate_buf.data_ptr(),\n", + ")\n", + "\n", + "assert ptrs_before == ptrs_after, f\"{ptrs_before} != {ptrs_after}\"\n", + "print(\"Test 4 PASS\")" + ] + }, + { + "cell_type": "markdown", + "id": "b118ea5561624da68c537baed56e602f", + "metadata": {}, + "source": [ + "## Test 5 \u2014 Per-forward gate boundary" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "938c804e27f84196a10c8828c723f798", + "metadata": {}, + "outputs": [], + "source": [ + "impl = fake_impl()\n", + "o = nvfp4_o_proj()\n", + "n = post_norm()\n", + "p = parent_layer(o, n)\n", + "\n", + "with (\n", + " patch(\"vllm.config.get_current_vllm_config\", return_value=mock_cfg(16)),\n", + " patch.object(CutePagedAttentionImpl, \"_preallocate_fusion_buffers\", _cpu_prealloc),\n", + "):\n", + " impl.attach_fusion(p)\n", + "\n", + "impl._resolve_fusion_weights()\n", + "assert impl._fusion_bound is True\n", + "\n", + "\n", + "def gate_decision(nat, is_decode_only):\n", + " fits = nat <= getattr(impl, \"_fusion_max_num_seqs\", 0)\n", + " return impl._fusion_bound and is_decode_only and fits\n", + "\n", + "\n", + "assert gate_decision(8, True) is True, \"decode within cap -> fuse\"\n", + "assert gate_decision(17, True) is False, \"decode over cap -> no fuse (A3)\"\n", + 
"assert gate_decision(8, False) is False, \"prefill -> no fuse\"\n", + "print(\"Test 5 PASS\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} \ No newline at end of file diff --git a/scripts/fusion_phaseb_diff.py b/scripts/fusion_phaseb_diff.py new file mode 100644 index 000000000000..430b63ea3f37 --- /dev/null +++ b/scripts/fusion_phaseb_diff.py @@ -0,0 +1,177 @@ +"""Phase B W_O GEMV — math isolation harness. + +Compares three views of the W_O dequant+GEMV computation: + (1) helpers.nvfp4_dequant from raw (unswizzled) scales + (2) our emulator using the kernel's swizzled-load indexing + (3) full matmul reference: attn @ W_dq.T vs per-CTA K-slice accumulation + +If (1) == (2) and matmul == per-CTA sum, the kernel's Phase B formulas are correct, +and the remaining bug is in runtime (CuTe compile-time specialization or sync). + +Run: .venv/bin/python scripts/fusion_phaseb_diff.py +""" + +from __future__ import annotations + +import sys + +import torch + +sys.path.insert(0, "/home/natfii/docker/nvllm") +sys.path.insert(0, "/home/natfii/.claude/skills/kernel-math-debug") + +from helpers import ( # type: ignore # noqa: E402 + compare, + nvfp4_dequant, +) +from safetensors import safe_open # noqa: E402 + +from vllm.model_executor.layers.quantization.utils.nvfp4_utils import ( # noqa: E402 + swizzle_blockscale, +) + +MODEL = ( + "/home/natfii/.cache/huggingface/hub/" + "models--natfii--Qwen3.5-27B-NVFP4-Opus-GB10/snapshots/" + "1496fc6e90a170fe575051d292284ecaa7053b6b/model.safetensors" +) +LAYER = 3 + +NUM_Q_HEADS = 24 +NUM_KV_HEADS = 4 +HEAD_DIM = 256 +GROUP_SIZE = NUM_Q_HEADS // NUM_KV_HEADS # 6 +HIDDEN_DIM = 5120 +K_DIM = NUM_Q_HEADS * HEAD_DIM # 6144 +NUM_K_GROUPS = K_DIM // 16 # 384 +NUM_K_TILES = (NUM_K_GROUPS + 3) // 4 # 96 — matches kernel wo_nkt + + +def kernel_swizzled_scale_offset(m: int, k_group: int, num_k_tiles: int) -> int: + """Mirror the kernel's _ld_swizzled_scale offset math.""" + m_tile = m >> 7 + outer_m = m & 31 + inner_m = (m >> 5) & 3 + k_tile = k_group >> 2 + inner_k = k_group & 3 + return (m_tile * num_k_tiles + k_tile) * 512 + outer_m * 16 + inner_m * 4 + inner_k + + +def main() -> None: + print("== Loading layer", LAYER, "o_proj from checkpoint ==") + with safe_open(MODEL, framework="pt", device="cuda") as sf: + W_packed = sf.get_tensor(f"model.layers.{LAYER}.self_attn.o_proj.weight_packed") + S_raw = sf.get_tensor(f"model.layers.{LAYER}.self_attn.o_proj.weight_scale") + GS_stored = sf.get_tensor( + f"model.layers.{LAYER}.self_attn.o_proj.weight_global_scale" + ) + + print(f" W_packed: {list(W_packed.shape)} {W_packed.dtype}") + print(f" S_raw: {list(S_raw.shape)} {S_raw.dtype}") + print(f" GS_stored (divisor): {GS_stored.item():.6f}") + + # Mirror process_weights_after_loading + S_swizzled = swizzle_blockscale(S_raw) + GS = (1.0 / GS_stored.max().to(torch.float32)).to(torch.float32) + print(f" S_swizzled: {list(S_swizzled.shape)} {S_swizzled.dtype}") + print(f" GS (true weight_global_scale): {GS.item():.6f}\n") + + # Reference dequant from raw scales + print("== Reference dequant (helpers, raw scales) ==") + W_dq_ref = nvfp4_dequant(W_packed, S_raw, GS).to("cuda") + print(f" W_dq_ref: {list(W_dq_ref.shape)} absmax={W_dq_ref.abs().max().item():.4f}\n") + + # --- 1. 
Scale load sanity: swizzled index vs raw index --- + print("== Scale-load sanity: swizzled offset vs raw[n, kg] ==") + S_swizzled_flat = S_swizzled.reshape(-1).to("cuda") + S_raw_cuda = S_raw.to("cuda") + + mismatches = 0 + torch.manual_seed(0) + sample_pairs = [(0, 0), (0, 5), (100, 50), (5119, 383), (2000, 200), + (128, 0), (128, 4), (127, 3), (63, 383)] + for n, kg in sample_pairs: + off = kernel_swizzled_scale_offset(n, kg, NUM_K_TILES) + sw = S_swizzled_flat[off].float().item() + raw = S_raw_cuda[n, kg].float().item() + match = abs(sw - raw) < 1e-6 + if not match: + mismatches += 1 + print(f" n={n:4d} kg={kg:3d} swizzled={sw:.6f} raw={raw:.6f} match={match}") + if mismatches: + print(f"\n !! {mismatches} scale mismatches — swizzle/offset math disagree.\n") + else: + print(" ✓ all sample (n, kg) pairs match across swizzle and raw\n") + + # --- 2. Full-matrix scale reconstruction via swizzle path --- + print("== Full scale reconstruction via kernel swizzle path ==") + # Vectorized equivalent of kernel_swizzled_scale_offset over all (n, kg). + n_idx = torch.arange(HIDDEN_DIM, device="cuda") + k_idx = torch.arange(NUM_K_GROUPS, device="cuda") + N, K = torch.meshgrid(n_idx, k_idx, indexing="ij") + m_tile = N >> 7 + outer_m = N & 31 + inner_m = (N >> 5) & 3 + k_tile = K >> 2 + inner_k = K & 3 + sf_offset = (m_tile * NUM_K_TILES + k_tile) * 512 + outer_m * 16 + inner_m * 4 + inner_k + sf_from_swizzle = S_swizzled_flat[sf_offset.view(-1)].view(HIDDEN_DIM, NUM_K_GROUPS).float() + compare(sf_from_swizzle, S_raw_cuda.float(), name="scale matrix (swizzle-path vs raw)") + print() + + # --- 3. Dequant via kernel indexing (swizzle path) --- + print("== W dequant via kernel indexing (swizzle path) ==") + W_packed_cuda = W_packed.to("cuda") + low_nib = (W_packed_cuda & 0x0F).to(torch.int64) + high_nib = ((W_packed_cuda >> 4) & 0x0F).to(torch.int64) + W_nibbles = torch.empty(HIDDEN_DIM, K_DIM, dtype=torch.int64, device="cuda") + W_nibbles[:, 0::2] = low_nib + W_nibbles[:, 1::2] = high_nib + + fp4_lut = torch.tensor( + [0.0, 0.5, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, + -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0], + dtype=torch.float32, + device="cuda", + ) + W_fp = fp4_lut[W_nibbles] + sf_expanded = sf_from_swizzle.repeat_interleave(16, dim=1) + W_dq_emu = W_fp * sf_expanded * GS.item() + compare(W_dq_emu, W_dq_ref, name="W_dq kernel-swizzle vs helpers raw") + print() + + # --- 4. Full GEMV accumulation: monolithic matmul vs per-CTA K-slice sum --- + print("== Full GEMV: matmul vs per-CTA K-slice accumulation ==") + torch.manual_seed(0) + NUM_SEQS = 2 + attn_output = ( + torch.randn(NUM_SEQS, NUM_Q_HEADS, HEAD_DIM, dtype=torch.bfloat16, device="cuda") + * 0.1 + ) + attn_flat = attn_output.view(NUM_SEQS, K_DIM).float() + + ref_wo_full = attn_flat @ W_dq_emu.T + + emu_wo = torch.zeros(NUM_SEQS, HIDDEN_DIM, dtype=torch.float32, device="cuda") + for kv_head_idx in range(NUM_KV_HEADS): + k_start = kv_head_idx * GROUP_SIZE * HEAD_DIM + k_end = k_start + GROUP_SIZE * HEAD_DIM + partial = attn_flat[:, k_start:k_end] @ W_dq_emu[:, k_start:k_end].T + emu_wo += partial + compare(emu_wo, ref_wo_full, name="GEMV per-CTA sum vs single matmul") + print() + + # --- 5. 
Final: emulator vs helpers reference end-to-end --- + print("== End-to-end: per-CTA emulator vs helpers reference ==") + helpers_ref = attn_flat @ W_dq_ref.T + compare(emu_wo, helpers_ref, name="per-CTA emulator vs helpers reference") + print() + + print("== Summary ==") + print("If all three checks are 'close=True', Phase B formulas are correct.") + print("The remaining bug is runtime — CuTe DSL specializing on Int32 fusion flags,") + print("or Phase A→B memory visibility, or actual launch path.") + + +if __name__ == "__main__": + main() diff --git a/scripts/serve-cute.sh b/scripts/serve-cute.sh index 78b11383e1e7..41999a3a0aeb 100755 --- a/scripts/serve-cute.sh +++ b/scripts/serve-cute.sh @@ -68,6 +68,7 @@ docker run -d \ -e VLLM_NVFP4_GEMM_BACKEND=cutlass \ -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \ + -e CUTE_DEBUG_FUSION="${CUTE_DEBUG_FUSION:-0}" \ "$NVLLM_IMAGE" \ serve \ --model "$HF_MODEL" \ diff --git a/tests/compile/test_dynamic_shapes_compilation.py b/tests/compile/test_dynamic_shapes_compilation.py index bbd62237c5e8..1775b2c9debc 100644 --- a/tests/compile/test_dynamic_shapes_compilation.py +++ b/tests/compile/test_dynamic_shapes_compilation.py @@ -222,3 +222,47 @@ def test(model_class, input1, input2, is_01_specialization=False): torch.randn(1, 10).cuda(), is_01_specialization=True, ) + + +@pytest.mark.skipif(not is_torch_equal_or_newer("2.10.0"), reason="requires torch 2.10") +def test_piecewise_backend_empty_sym_shape_indices(): + """Test that PiecewiseBackend handles empty sym_shape_indices correctly. + + When all inputs have static shapes (no torch.SymInt), sym_shape_indices + will be empty. The fix in PiecewiseBackend.__call__ handles this case + by using the first compiled range_entry. + """ + gc.collect() + torch.accelerator.empty_cache() + torch.accelerator.synchronize() + + # Use small max_model_len and max_num_batched_tokens to encourage + # static shape compilation with empty sym_shape_indices + llm = LLM( + model="Qwen/Qwen3-0.6B", + max_model_len=512, + max_num_batched_tokens=1, + compilation_config={ + "mode": CompilationMode.VLLM_COMPILE, + "dynamic_shapes_config": { + "type": DynamicShapesType.BACKED.value, + }, + }, + ) + + sampling_params = SamplingParams(temperature=0, top_p=0.95, max_tokens=10) + + # Generate with static shape inputs + output = llm.generate("Hello, my name is", sampling_params=sampling_params) + result = output[0].outputs[0].text + assert len(result) > 0, "Should generate non-empty output" + + # Generate again to verify compilation works with empty sym_shape_indices + output = llm.generate("The capital of France is", sampling_params=sampling_params) + result = output[0].outputs[0].text + assert len(result) > 0, "Should generate non-empty output on second run" + + del llm + gc.collect() + torch.accelerator.empty_cache() + torch.accelerator.synchronize() diff --git a/tools/pre_commit/mypy.py b/tools/pre_commit/mypy.py index 1ba1f81564cc..07e84ffb38fb 100755 --- a/tools/pre_commit/mypy.py +++ b/tools/pre_commit/mypy.py @@ -35,6 +35,7 @@ # TODO(woosuk): Include the code from Megatron and HuggingFace. EXCLUDE = [ "vllm/model_executor/models", + "vllm/nvllm/models", "vllm/model_executor/layers/fla/ops", # Ignore triton kernels in ops. 
"vllm/v1/attention/ops", diff --git a/vllm/compilation/piecewise_backend.py b/vllm/compilation/piecewise_backend.py index 7474d0bf841b..4658724d8566 100644 --- a/vllm/compilation/piecewise_backend.py +++ b/vllm/compilation/piecewise_backend.py @@ -353,12 +353,22 @@ def _find_range_for_shape(self, runtime_shape: int) -> RangeEntry | None: return None def __call__(self, *args: Any) -> Any: - runtime_shape = args[self.sym_shape_indices[0]] - range_entry = self._find_range_for_shape(runtime_shape) + if self.sym_shape_indices: + runtime_shape = args[self.sym_shape_indices[0]] + range_entry = self._find_range_for_shape(runtime_shape) + assert range_entry is not None, ( + f"Shape: {runtime_shape} out of considered ranges: " + f"{self.compile_ranges}" + ) + else: + # All inputs have static shapes; use the only compiled range_entry + compiled_entries = [re for re in self.range_entries.values() if re.compiled] + assert len(compiled_entries) == 1, ( + f"Expected exactly one compiled range_entry for static shape " + f"compilation, but found {len(compiled_entries)}" + ) + range_entry = compiled_entries[0] - assert range_entry is not None, ( - f"Shape: {runtime_shape} out of considered ranges: {self.compile_ranges}" - ) assert range_entry.compiled, ( "All ranges should be compiled or loaded up front in " "PiecewiseBackend.__init__. " diff --git a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py index 26409804c48d..0d47b0f31748 100644 --- a/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/flashinfer_cutlass_moe.py @@ -130,7 +130,14 @@ def _supports_current_device() -> bool: p.is_device_capability(90) or p.is_device_capability_family(100) or p.is_device_capability_family(110) - or p.is_device_capability_family(120) + or p.is_device_capability(120) + # NOTE: SM121 (DGX Spark) is excluded because the bf16 + # unquantized CUTLASS MoE GEMM in flashinfer <= 0.6.7 has no + # Relu2 template instantiation and throws "Invalid activation + # type" on Nemotron-H. Fixed upstream by + # https://github.com/flashinfer-ai/flashinfer/pull/2926 + # (merged 2026-04-01, not yet in a stable release); lift this + # restriction once flashinfer >= 0.6.8 is the minimum. ) and has_flashinfer_cutlass_fused_moe() ) diff --git a/vllm/model_executor/models/qwen3_5.py b/vllm/model_executor/models/qwen3_5.py index 46b10d2e8b46..d89ff0c9b2fe 100644 --- a/vllm/model_executor/models/qwen3_5.py +++ b/vllm/model_executor/models/qwen3_5.py @@ -1,898 +1,15 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Copyright contributors to the nvllm fork +"""Shim: Qwen3.5 model surface moved to `vllm.nvllm.models.qwen3_5`. -# Copyright 2025 The vLLM team. -# Copyright 2025 The Qwen Team. -# Copyright 2025 The HuggingFace Inc. team. -# All rights reserved. -# -# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX -# and OPT implementations in this library. It has been modified from its -# original forms to accommodate minor architectural differences compared -# to GPT-NeoX and OPT used by the Meta AI team that trained the model. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Inference-only Qwen3.5 Series compatible with HuggingFace weights.""" +This shim re-exports every public symbol so the upstream registry +(`vllm/model_executor/models/registry.py:1283-1284` hardcodes the +`vllm.model_executor.models.` prefix), `vllm/model_executor/models/colqwen3_5.py`, +and `vllm/model_executor/models/qwen3_5_mtp.py` continue to resolve +their existing import paths without edits. -import typing -from collections.abc import Callable, Iterable +See `vllm/nvllm/README.md` for the ownership boundary. +""" -import torch -from torch import nn - -from vllm.compilation.decorators import support_torch_compile -from vllm.config import VllmConfig -from vllm.distributed import ( - get_pp_group, -) -from vllm.logger import init_logger -from vllm.model_executor.layers.layernorm import ( - GemmaRMSNorm as Qwen3_5RMSNorm, -) -from vllm.model_executor.layers.logits_processor import LogitsProcessor -from vllm.model_executor.layers.mamba.gdn_linear_attn import GatedDeltaNetAttention -from vllm.model_executor.layers.mamba.mamba_utils import ( - MambaStateCopyFunc, - MambaStateCopyFuncCalculator, - MambaStateDtypeCalculator, - MambaStateShapeCalculator, -) -from vllm.model_executor.layers.vocab_parallel_embedding import ( - ParallelLMHead, - VocabParallelEmbedding, -) -from vllm.model_executor.model_loader.weight_utils import ( - default_weight_loader, - maybe_remap_kv_scale_name, -) -from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.sequence import IntermediateTensors -from vllm.transformers_utils.configs.qwen3_5 import ( - Qwen3_5Config, - Qwen3_5TextConfig, -) -from vllm.transformers_utils.configs.qwen3_5_moe import ( - Qwen3_5MoeConfig, - Qwen3_5MoeTextConfig, -) - -from .interfaces import ( - HasInnerState, - IsHybrid, - MixtureOfExperts, - MultiModalEmbeddings, - SupportsEagle3, - SupportsLoRA, - SupportsPP, - _require_is_multimodal, -) -from .qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP -from .qwen3_next import ( - Qwen3NextAttention, - Qwen3NextDecoderLayer, - Qwen3NextModel, - Qwen3NextSparseMoeBlock, - QwenNextMixtureOfExperts, -) -from .qwen3_vl import ( - Qwen3_VisionTransformer, - Qwen3VLDummyInputsBuilder, - Qwen3VLForConditionalGeneration, - Qwen3VLMultiModalProcessor, - Qwen3VLProcessingInfo, -) -from .utils import ( - AutoWeightsLoader, - PPMissingLayer, - _merge_multimodal_embeddings, - extract_layer_index, - is_pp_missing_parameter, - make_empty_intermediate_tensors_factory, - make_layers, - maybe_prefix, -) - -logger = init_logger(__name__) - - -class Qwen3_5ProcessingInfo(Qwen3VLProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(Qwen3_5Config) - - -class Qwen3_5MoeProcessingInfo(Qwen3VLProcessingInfo): - def get_hf_config(self): - return self.ctx.get_hf_config(Qwen3_5MoeConfig) - - -class Qwen3_5DecoderLayer(Qwen3NextDecoderLayer): - def __init__( - self, - vllm_config: VllmConfig, - layer_type: str, - prefix: str = "", - ) -> None: - super(Qwen3NextDecoderLayer, self).__init__() - - config = vllm_config.model_config.hf_text_config - model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - 
quant_config = vllm_config.quant_config - - self.layer_type = layer_type - self.layer_idx = extract_layer_index(prefix) - - if self.layer_type == "linear_attention": - self.linear_attn = GatedDeltaNetAttention( - config=config, - vllm_config=vllm_config, - prefix=f"{prefix}.linear_attn", - gqa_interleaved_layout=False, - create_in_proj_qkvz=vllm_config.lora_config is None, - ) - elif self.layer_type == "full_attention": - self.self_attn = Qwen3NextAttention( - config, - model_config=model_config, - cache_config=cache_config, - quant_config=quant_config, - prefix=f"{prefix}.self_attn", - ) - else: - raise ValueError(f"Invalid layer_type {self.layer_type}") - - # NOTE: Determine the MLP type based on the model type - # Qwen3.5 use all layers for MLP / Qwen3.5-MoE use sparse MoE blocks - if config.model_type == "qwen3_5_moe_text": - self.mlp = Qwen3NextSparseMoeBlock( - vllm_config=vllm_config, - prefix=f"{prefix}.mlp", - ) - elif config.model_type == "qwen3_5_text": - self.mlp = Qwen3NextMLP( - hidden_size=config.hidden_size, - intermediate_size=config.intermediate_size, - hidden_act=config.hidden_act, - quant_config=quant_config, - prefix=f"{prefix}.mlp", - ) - else: - raise ValueError(f"Invalid model_type {config.model_type}") - - self.input_layernorm = Qwen3_5RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - self.post_attention_layernorm = Qwen3_5RMSNorm( - config.hidden_size, eps=config.rms_norm_eps - ) - - self.layer_scale = getattr(config, "layer_scale", False) - if self.layer_scale: - self.attn_layer_scale = torch.nn.Parameter( - torch.zeros( - 1, - 1, - config.hidden_size, - ), - ) - self.ffn_layer_scale = torch.nn.Parameter( - torch.zeros( - 1, - 1, - config.hidden_size, - ), - ) - - # Fusion binding happens in _try_bind_fusion() (inherited from - # Qwen3NextDecoderLayer) after weights are loaded on first forward. - # Must set here because super().__init__ skips Qwen3NextDecoderLayer. - self._max_num_seqs = vllm_config.scheduler_config.max_num_seqs - self._fusion_bound = False - - -@support_torch_compile( - dynamic_arg_dims={ - "input_ids": 0, - # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, - # otherwise (seq_len, ). - "positions": -1, - "intermediate_tensors": 0, - "inputs_embeds": 0, - } -) -class Qwen3_5Model(Qwen3NextModel): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super(Qwen3NextModel, self).__init__() - - config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig = ( - vllm_config.model_config.hf_text_config - ) - parallel_config = vllm_config.parallel_config - - eplb_config = parallel_config.eplb_config - self.num_redundant_experts = eplb_config.num_redundant_experts - - self.config = config - self.enable_lora = vllm_config.lora_config is not None - - self.vocab_size = config.vocab_size - - self.embed_tokens = VocabParallelEmbedding( - self.vocab_size, - config.hidden_size, - ) - - def get_layer(prefix: str): - return Qwen3_5DecoderLayer( - vllm_config, - layer_type=config.layer_types[extract_layer_index(prefix)], - prefix=prefix, - ) - - self.start_layer, self.end_layer, self.layers = make_layers( - config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers" - ) - self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( - ["hidden_states", "residual"], config.hidden_size - ) - - if get_pp_group().is_last_rank: - self.norm = Qwen3_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps) - else: - self.norm = PPMissingLayer() - - self.aux_hidden_state_layers: tuple[int, ...] 
= () - - def load_fused_expert_weights( - self, - name: str, - params_dict: dict, - loaded_weight: torch.Tensor, - shard_id: str, - num_experts: int, - ) -> bool: - param = params_dict[name] - weight_loader = typing.cast(Callable[..., bool], param.weight_loader) - loaded_local_expert = False - for expert_id in range(num_experts): - curr_expert_weight = loaded_weight[expert_id] - success = weight_loader( - param, - curr_expert_weight, - name, - shard_id, - expert_id, - return_success=True, - ) - if success: - loaded_local_expert = True - - return loaded_local_expert - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - stacked_params_mapping = [ - # (param_name, shard_name, shard_id) - # self attention - ("qkv_proj", "q_proj", "q"), - ("qkv_proj", "k_proj", "k"), - ("qkv_proj", "v_proj", "v"), - # mlp - ("gate_up_proj", "gate_proj", 0), - ("gate_up_proj", "up_proj", 1), - ("in_proj_ba", "in_proj_b", 0), - ("in_proj_ba", "in_proj_a", 1), - ] - - if self.enable_lora: - stacked_params_mapping.extend( - [ - ("in_proj_qkv", "in_proj_qkv", (0, 1, 2)), - ("in_proj_z", "in_proj_z", 0), - ] - ) - else: - stacked_params_mapping.extend( - [ - ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)), - ("in_proj_qkvz", "in_proj_z", 3), - ] - ) - - params_dict = dict(self.named_parameters()) - loaded_params: set[str] = set() - expert_params_mapping = self.get_expert_mapping() - is_fused_expert = False - fused_expert_params_mapping = [ - ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"), - ("experts.w2_weight", "experts.down_proj", 0, "w2"), - ] - num_experts = ( - self.config.num_experts if hasattr(self.config, "num_experts") else 0 - ) - for name, loaded_weight in weights: - if "rotary_emb.inv_freq" in name: - continue - - if name.startswith("mtp."): - continue - - # Remapping the name of FP8 kv-scale. - if name.endswith("scale"): - name = maybe_remap_kv_scale_name(name, params_dict) - if name is None: - continue - - for param_name, weight_name, shard_id in stacked_params_mapping: - if "experts.gate_up_proj" in name or "experts.down_proj" in name: - is_fused_expert = True - expert_params_mapping = fused_expert_params_mapping - - if weight_name not in name: - continue - - if "mlp.experts" in name: - continue - - name = name.replace(weight_name, param_name) - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - # Skip layers on other devices. - if is_pp_missing_parameter(name, self): - continue - # name = apply_attn_prefix(name, params_dict) - if name not in params_dict: - continue - param = params_dict[name] - weight_loader = param.weight_loader - if param_name == "in_proj_z" and self.enable_lora: - weight_loader(param, loaded_weight) - else: - weight_loader(param, loaded_weight, shard_id) - break - else: - is_expert_weight = False - for mapping in expert_params_mapping: - param_name, weight_name, expert_id, shard_id = mapping - if weight_name not in name: - continue - is_expert_weight = True - name_mapped = name.replace(weight_name, param_name) - # Skip layers on other devices. 
- if is_pp_missing_parameter(name_mapped, self): - continue - if is_fused_expert: - # qwen3.5 no need to transpose - # loaded_weight = loaded_weight.transpose(-1, -2) - if "experts.gate_up_proj" in name: - loaded_weight = loaded_weight.chunk(2, dim=-2) - success_w1 = self.load_fused_expert_weights( - name_mapped, - params_dict, - loaded_weight[0], - "w1", - num_experts, - ) - success_w3 = self.load_fused_expert_weights( - name_mapped, - params_dict, - loaded_weight[1], - "w3", - num_experts, - ) - success = success_w1 and success_w3 - else: - # down_proj - success = self.load_fused_expert_weights( - name_mapped, - params_dict, - loaded_weight, - shard_id, - num_experts, - ) - if success: - name = name_mapped - break - else: - # Skip loading extra bias for GPTQ models. - if ( - name_mapped.endswith(".bias") - or name_mapped.endswith("_bias") - ) and name_mapped not in params_dict: - continue - param = params_dict[name_mapped] - weight_loader = param.weight_loader - success = weight_loader( - param, - loaded_weight, - name_mapped, - shard_id=shard_id, - expert_id=expert_id, - return_success=True, - ) - if success: - name = name_mapped - break - else: - if is_expert_weight: - # We've checked that this is an expert weight - # However it's not mapped locally to this rank - # So we simply skip it - continue - # Skip loading extra bias for GPTQ models. - if name.endswith(".bias") and name not in params_dict: - continue - if is_pp_missing_parameter(name, self): - continue - if name not in params_dict: - logger.warning_once( - f"Parameter {name} not found in params_dict, skip loading" - ) - continue - param = params_dict[name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - weight_loader(param, loaded_weight) - loaded_params.add(name) - return loaded_params - - -class Qwen3_5ForCausalLMBase( - nn.Module, - HasInnerState, - IsHybrid, - SupportsEagle3, - SupportsLoRA, - SupportsPP, -): - packed_modules_mapping = { - "qkv_proj": [ - "q_proj", - "k_proj", - "v_proj", - ], - "gate_up_proj": ["gate_proj", "up_proj"], - # GDN fused projections. - "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"], - "in_proj_ba": ["in_proj_b", "in_proj_a"], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - config = vllm_config.model_config.hf_text_config - self.vllm_config = vllm_config - self.model_config = vllm_config.model_config - cache_config = vllm_config.cache_config - - scheduler_config = vllm_config.scheduler_config - if cache_config.mamba_cache_mode == "all": - raise NotImplementedError( - "Qwen3.5 currently does not support 'all' prefix caching, " - "please use '--mamba-cache-mode=align' instead" - ) - self.quant_config = vllm_config.quant_config - - super().__init__() - self.config = config - self.scheduler_config = scheduler_config - self.model = Qwen3_5Model( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") - ) - - # When LoRA is enabled, GDN uses separate in_proj_qkv and in_proj_z - # instead of merged in_proj_qkvz; pack mapping must match. 
- if vllm_config.lora_config: - base = getattr(Qwen3_5ForCausalLMBase, "packed_modules_mapping", {}) - self.packed_modules_mapping = {k: list(v) for k, v in base.items()} - self.packed_modules_mapping.pop("in_proj_qkvz", None) - self.packed_modules_mapping["in_proj_qkv"] = ["in_proj_qkv"] - self.packed_modules_mapping["in_proj_z"] = ["in_proj_z"] - - if get_pp_group().is_last_rank: - if config.tie_word_embeddings: - self.lm_head = self.model.embed_tokens - else: - self.lm_head = ParallelLMHead( - config.vocab_size, - config.hidden_size, - prefix=maybe_prefix(prefix, "lm_head"), - ) - else: - self.lm_head = PPMissingLayer() - - self.logits_processor = LogitsProcessor(config.vocab_size) - self.make_empty_intermediate_tensors = ( - self.model.make_empty_intermediate_tensors - ) - - def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: - return self.model.embed_input_ids(input_ids) - - def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: - self.model.aux_hidden_state_layers = layers - - def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: - num_layers = len(self.model.layers) - return (2, num_layers // 2, num_layers - 3) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **kwargs: object, - ): - hidden_states = self.model( - input_ids, positions, intermediate_tensors, inputs_embeds - ) - - return hidden_states - - def compute_logits( - self, - hidden_states: torch.Tensor, - ) -> torch.Tensor | None: - return self.logits_processor(self.lm_head, hidden_states) - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=["mtp."], - ) - return loader.load_weights(weights) - - @classmethod - def get_mamba_state_dtype_from_config( - cls, - vllm_config: "VllmConfig", - ) -> tuple[torch.dtype, torch.dtype]: - return MambaStateDtypeCalculator.gated_delta_net_state_dtype( - vllm_config.model_config.dtype, - vllm_config.cache_config.mamba_cache_dtype, - vllm_config.cache_config.mamba_ssm_cache_dtype, - ) - - @classmethod - def get_mamba_state_shape_from_config( - cls, vllm_config: "VllmConfig" - ) -> tuple[tuple[int, int], tuple[int, int]]: - parallel_config = vllm_config.parallel_config - hf_config = vllm_config.model_config.hf_text_config - tp_size = parallel_config.tensor_parallel_size - num_spec = ( - vllm_config.speculative_config.num_speculative_tokens - if vllm_config.speculative_config - else 0 - ) - return MambaStateShapeCalculator.gated_delta_net_state_shape( - tp_size, - hf_config.linear_num_key_heads, - hf_config.linear_num_value_heads, - hf_config.linear_key_head_dim, - hf_config.linear_value_head_dim, - hf_config.linear_conv_kernel_dim, - num_spec, - ) - - @classmethod - def get_mamba_state_copy_func( - cls, - ) -> tuple[MambaStateCopyFunc, MambaStateCopyFunc]: - return MambaStateCopyFuncCalculator.gated_delta_net_state_copy_func() - - -class Qwen3_5ForCausalLM(Qwen3_5ForCausalLMBase): - pass - # TODO: Re-enable fusion binding after kernel Phase B/C is fixed. - # Must run AFTER process_weights_after_loading (NVFP4 weight - # attributes don't exist during load_weights). Can't run in - # forward() either — torch._dynamo traces it and chokes on - # logger/isinstance calls. Need a post-weight-processing hook. 
- # - # def load_weights(self, weights): - # result = super().load_weights(weights) - # for layer in self.model.layers: - # if not layer._fusion_bound: - # layer._fusion_bound = layer._try_bind_fusion() - # return result - - -class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLMBase, QwenNextMixtureOfExperts): - def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): - super().__init__(vllm_config=vllm_config, prefix=prefix) - - # set MoE hyperparameters - self.set_moe_parameters() - - def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: - return self.model.get_expert_mapping() - - -######################################################## -# Qwen3_5-Dense -######################################################## - - -@MULTIMODAL_REGISTRY.register_processor( - Qwen3VLMultiModalProcessor, - info=Qwen3_5ProcessingInfo, - dummy_inputs=Qwen3VLDummyInputsBuilder, -) -class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid): - # Qwen3.5 does not support multimodal pruning (EVS). - supports_multimodal_pruning = False - - packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | { - "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"], - "in_proj_ba": ["in_proj_b", "in_proj_a"], - } - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): - # protocols have not __init__ method, so we need to use nn.Module.__init__ - nn.Module.__init__(self) - self.update_packed_mapping(enable_lora=vllm_config.lora_config is not None) - config: Qwen3_5Config = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config - - self.config = config - self.multimodal_config = multimodal_config - self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" - # Qwen3.5 does not support multimodal pruning (EVS). 
- self.is_multimodal_pruning_enabled = False - - with self._mark_tower_model(vllm_config, {"image", "video"}): - self.visual = Qwen3_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) - - with self._mark_language_model(vllm_config): - self.language_model = Qwen3_5ForCausalLM( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model") - ) - - self.make_empty_intermediate_tensors = ( - self.language_model.make_empty_intermediate_tensors - ) - - def update_packed_mapping(self, enable_lora: bool): - # When LoRA is enabled, GDN uses separate in_proj_qkv and in_proj_z - if enable_lora: - base = getattr( - Qwen3_5ForConditionalGeneration, "packed_modules_mapping", {} - ) - self.packed_modules_mapping = {k: list(v) for k, v in base.items()} - self.packed_modules_mapping.pop("in_proj_qkvz", None) - self.packed_modules_mapping["in_proj_qkv"] = ["in_proj_qkv"] - self.packed_modules_mapping["in_proj_z"] = ["in_proj_z"] - - def embed_input_ids( - self, - input_ids: torch.Tensor, - multimodal_embeddings: MultiModalEmbeddings | None = None, - *, - is_multimodal: torch.Tensor | None = None, - ) -> torch.Tensor: - inputs_embeds = self._embed_text_input_ids( - input_ids, - self.language_model.embed_input_ids, - is_multimodal=is_multimodal, - ) - - if multimodal_embeddings is None or len(multimodal_embeddings) == 0: - return inputs_embeds - - is_multimodal = _require_is_multimodal(is_multimodal) - - inputs_embeds = _merge_multimodal_embeddings( - inputs_embeds=inputs_embeds, - multimodal_embeddings=multimodal_embeddings, - is_multimodal=is_multimodal, - ) - - return inputs_embeds - - def recompute_mrope_positions(self, *args, **kwargs): - raise NotImplementedError( - "Qwen3.5 does not support multimodal pruning (EVS). " - "recompute_mrope_positions should never be called." - ) - - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - intermediate_tensors: IntermediateTensors | None = None, - inputs_embeds: torch.Tensor | None = None, - **kwargs: object, - ) -> torch.Tensor | IntermediateTensors: - """Run forward pass for Qwen3.5. - - Args: - input_ids: Flattened (concatenated) input_ids corresponding to a - batch. - positions: Flattened (concatenated) position ids corresponding to a - batch. - **NOTE**: If mrope is enabled (default setting for Qwen3VL - opensource models), the shape will be `(3, seq_len)`, - otherwise it will be `(seq_len,). - intermediate_tensors: Intermediate tensors from previous pipeline - stages. - inputs_embeds: Pre-computed input embeddings. - **kwargs: Additional keyword arguments including: - - pixel_values: Pixel values to be fed to a model. - `None` if no images are passed. - - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in - LLM. `None` if no images are passed. - - pixel_values_videos: Pixel values of videos to be fed to a - model. `None` if no videos are passed. - - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in - LLM. `None` if no videos are passed. 
- """ - - if intermediate_tensors is not None: - inputs_embeds = None - - hidden_states = self.language_model.model( - input_ids=input_ids, - positions=positions, - intermediate_tensors=intermediate_tensors, - inputs_embeds=inputs_embeds, - ) - - return hidden_states - - def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: - loader = AutoWeightsLoader( - self, - skip_prefixes=["mtp."], - ) - return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) - - @classmethod - def get_mamba_state_dtype_from_config( - cls, - vllm_config: "VllmConfig", - ) -> tuple[torch.dtype, torch.dtype]: - return MambaStateDtypeCalculator.gated_delta_net_state_dtype( - vllm_config.model_config.dtype, - vllm_config.cache_config.mamba_cache_dtype, - vllm_config.cache_config.mamba_ssm_cache_dtype, - ) - - @classmethod - def get_mamba_state_shape_from_config( - cls, vllm_config: "VllmConfig" - ) -> tuple[tuple[int, int], tuple[int, int]]: - parallel_config = vllm_config.parallel_config - hf_config = vllm_config.model_config.hf_text_config - tp_size = parallel_config.tensor_parallel_size - num_spec = ( - vllm_config.speculative_config.num_speculative_tokens - if vllm_config.speculative_config - else 0 - ) - return MambaStateShapeCalculator.gated_delta_net_state_shape( - tp_size, - hf_config.linear_num_key_heads, - hf_config.linear_num_value_heads, - hf_config.linear_key_head_dim, - hf_config.linear_value_head_dim, - hf_config.linear_conv_kernel_dim, - num_spec, - ) - - @classmethod - def get_mamba_state_copy_func(cls) -> tuple[MambaStateCopyFunc, MambaStateCopyFunc]: - return MambaStateCopyFuncCalculator.gated_delta_net_state_copy_func() - - -######################################################## -# Qwen3_5-MoE -######################################################## - - -class Qwen3_5_MoeMixtureOfExperts(MixtureOfExperts): - def update_physical_experts_metadata( - self, - num_physical_experts: int, - num_local_physical_experts: int, - ) -> None: - assert self.num_local_physical_experts == num_local_physical_experts - self.num_physical_experts = num_physical_experts - self.num_local_physical_experts = num_local_physical_experts - self.num_redundant_experts = num_physical_experts - self.num_logical_experts - for layer in self.language_model.model.layers: - if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): - moe = layer.mlp - moe.n_local_physical_experts = num_local_physical_experts - moe.n_physical_experts = num_physical_experts - moe.n_redundant_experts = self.num_redundant_experts - moe.experts.update_expert_map() - - def set_moe_parameters(self): - self.expert_weights = [] - - self.moe_layers = [] - example_moe = None - for layer in self.language_model.model.layers: - if isinstance(layer, Qwen3_5DecoderLayer) and isinstance( - layer.mlp, Qwen3NextSparseMoeBlock - ): - example_moe = layer.mlp - self.moe_layers.append(layer.mlp.experts) - - if example_moe is None: - raise RuntimeError( - "No Qwen3_5 layer found in the language_model.model.layers." 
- ) - - # Set MoE hyperparameters - self.num_moe_layers = len(self.moe_layers) - self.num_expert_groups = 1 - self.num_shared_experts = 0 - self.num_logical_experts = example_moe.n_logical_experts - self.num_physical_experts = example_moe.n_physical_experts - self.num_local_physical_experts = example_moe.n_local_physical_experts - self.num_routed_experts = example_moe.n_routed_experts - self.num_redundant_experts = example_moe.n_redundant_experts - - -@MULTIMODAL_REGISTRY.register_processor( - Qwen3VLMultiModalProcessor, - info=Qwen3_5MoeProcessingInfo, - dummy_inputs=Qwen3VLDummyInputsBuilder, -) -class Qwen3_5MoeForConditionalGeneration( - Qwen3_5ForConditionalGeneration, Qwen3_5_MoeMixtureOfExperts -): - # For MoE LoRA weights loading - is_3d_moe_weight: bool = True - - def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): - # protocols have not __init__ method, so we need to use nn.Module.__init__ - nn.Module.__init__(self) - self.update_packed_mapping(enable_lora=vllm_config.lora_config is not None) - config: Qwen3_5MoeConfig = vllm_config.model_config.hf_config - quant_config = vllm_config.quant_config - multimodal_config = vllm_config.model_config.multimodal_config - - self.config = config - self.multimodal_config = multimodal_config - self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" - # Qwen3.5 does not support multimodal pruning (EVS). - self.is_multimodal_pruning_enabled = False - - with self._mark_tower_model(vllm_config, {"image", "video"}): - self.visual = Qwen3_VisionTransformer( - config.vision_config, - norm_eps=getattr(config, "rms_norm_eps", 1e-6), - quant_config=quant_config, - prefix=maybe_prefix(prefix, "visual"), - ) - - with self._mark_language_model(vllm_config): - self.language_model = Qwen3_5MoeForCausalLM( - vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model") - ) - - self.make_empty_intermediate_tensors = ( - self.language_model.make_empty_intermediate_tensors - ) - - # set MoE hyperparameters - self.set_moe_parameters() +from vllm.nvllm.models.qwen3_5 import * # noqa: F401, F403 diff --git a/vllm/model_executor/models/qwen3_next.py b/vllm/model_executor/models/qwen3_next.py index 21441c2544ec..6cf386cc8ba2 100644 --- a/vllm/model_executor/models/qwen3_next.py +++ b/vllm/model_executor/models/qwen3_next.py @@ -282,7 +282,6 @@ def forward( positions: torch.Tensor, output: torch.Tensor, hidden_states: torch.Tensor, - fusion_active: bool = False, ): qkv, _ = self.qkv_proj(hidden_states) @@ -296,9 +295,7 @@ def forward( q = q.reshape(*orig_shape, -1) gate = gate.reshape(*orig_shape, -1) else: - q, k, v = qkv.split( - [self.q_size, self.kv_size, self.kv_size], dim=-1) - gate = None + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)).view( -1, self.num_heads * self.head_dim @@ -309,31 +306,13 @@ def forward( q, k = self.rotary_emb(positions, q, k) - if fusion_active and gate is not None: - # Write gate to impl's persistent buffer for kernel fusion. - # Kernel does: sigmoid(gate) * attn → W_O GEMV → RMSNorm - # Use num_actual_tokens from attn_metadata (not padded shape). - from vllm.forward_context import get_forward_context - _nat = get_forward_context().attn_metadata[ - self.attn.layer_name].num_actual_tokens - self.attn.impl.gate_buf[:_nat].copy_(gate[:_nat]) - - # Signal fusion activation to backend impl. 
The impl only sends - # fusion pointers to the kernel when BOTH _fusion_bound (weights - # allocated) AND _fusion_active (this call is fused) are True. - self.attn.impl._fusion_active = fusion_active - attn_output = self.attn(q, k, v) - if not fusion_active: - # Unfused path: apply gate and o_proj in Python - if self.attn_output_gate and gate is not None: - gate = torch.sigmoid(gate) - attn_output = attn_output * gate - output[:], _ = self.o_proj(attn_output) - # When fusion_active, kernel already wrote to impl's persistent - # buffers (wo_output, rmsnorm_output, residual_output). - # Caller reads from those buffers directly. + if self.attn_output_gate: + gate = torch.sigmoid(gate) + attn_output = attn_output * gate + + output[:], _ = self.o_proj(attn_output) class Qwen3NextDecoderLayer(nn.Module): @@ -415,44 +394,6 @@ def __init__( ), ) - # Fusion binding happens in _try_bind_fusion() after weights are loaded. - # Save max_num_seqs now — vllm_config is available during __init__ - # but NOT during forward (get_current_vllm_config() fails there). - self._max_num_seqs = vllm_config.scheduler_config.max_num_seqs - self._fusion_bound = False - - def _try_bind_fusion(self) -> bool: - """Attempt to bind CuTe fusion weights. Returns True if successful.""" - if self.layer_type != "full_attention": - return False - - from vllm.v1.attention.backends.cute_paged._backend import ( - CutePagedAttentionImpl, - ) - impl = self.self_attn.attn.impl - if not isinstance(impl, CutePagedAttentionImpl): - return False - - o_proj = self.self_attn.o_proj - if not hasattr(o_proj, 'weight_global_scale'): - logger.warning( - "CuTe fusion: o_proj weights not loaded yet or not NVFP4, " - "skipping fusion binding for layer %d", self.layer_idx) - return False - - impl.bind_fusion_weights( - wo_weight=o_proj.weight, - wo_scales=o_proj.weight_scale, - wo_global_scale=o_proj.weight_global_scale, - rmsnorm_gamma=self.post_attention_layernorm.weight, - rmsnorm_eps=self.post_attention_layernorm.variance_epsilon, - max_num_seqs=self._max_num_seqs, - ) - - logger.info( - "CuTe fusion bound for layer %d (full_attention)", self.layer_idx) - return True - def forward( self, hidden_states: torch.Tensor, @@ -460,84 +401,27 @@ def forward( positions: torch.Tensor = None, **kwargs: object, ): - # Fusion binding deferred — _try_bind_fusion() contains logger - # calls and dynamic Python that break torch._dynamo graph - # capture (PIECEWISE mode). Binding is triggered from - # process_weights_after_loading() in the model class instead. - # - # Original lazy-bind (re-enable when CUDA graph capture is sorted): - # if not self._fusion_bound: - # self._fusion_bound = self._try_bind_fusion() - if residual is None: residual = hidden_states hidden_states = self.input_layernorm(hidden_states) else: - hidden_states, residual = self.input_layernorm( - hidden_states, residual) - - num_tokens = hidden_states.shape[0] - - # TODO: Re-enable fusion once Phase B/C kernel is validated. - # Infrastructure is ready (bugs 1-5 fixed), kernel math TBD. - # Bugs fixed: (1) _fusion_active flag, (2) self-zero race, - # (3) padded tensor detection, (4) buffer alloc timing, - # (5) padding row zeroing. 
- fusion_active = False - nat = num_tokens - # --- Fusion guard (uncomment to re-enable) --- - # if self._fusion_bound and self.layer_type == "full_attention": - # try: - # from vllm.forward_context import get_forward_context - # ctx = get_forward_context() - # attn_md = ctx.attn_metadata[ - # self.self_attn.attn.layer_name] - # nat = attn_md.num_actual_tokens - # is_decode = getattr(attn_md, 'is_decode_only', False) - # fusion_active = is_decode and nat <= self._max_num_seqs - # except (RuntimeError, KeyError, AttributeError, TypeError): - # pass - # --- End fusion guard --- - - if fusion_active: - # Write residual to impl's persistent buffer for Phase C. - # Kernel reads this for: new_residual = residual + wo_output - # Use nat (actual tokens), not num_tokens (padded batch). - impl = self.self_attn.attn.impl - impl.residual_buf[:nat].copy_(residual[:nat]) + hidden_states, residual = self.input_layernorm(hidden_states, residual) self_attention_output = torch.empty_like(hidden_states) - if self.layer_type == "linear_attention": self.linear_attn( hidden_states=hidden_states, output=self_attention_output, ) - hidden_states = self_attention_output elif self.layer_type == "full_attention": self.self_attn( hidden_states=hidden_states, output=self_attention_output, positions=positions, - fusion_active=fusion_active, ) - if fusion_active: - # Kernel produced: rmsnorm_output (hidden_states for MLP) - # and residual_output (updated residual for next layer). - # Skip post_attention_layernorm — kernel did it. - # Write INTO the padded tensors at [:nat]. Zero padding - # rows [nat:] so the MLP sees zeros (matching unfused - # path where attention on zero-padded input → zero). - self_attention_output[:nat].copy_( - impl.rmsnorm_output[:nat]) - if nat < num_tokens: - self_attention_output[nat:].zero_() - residual[:nat].copy_(impl.residual_output[:nat]) - hidden_states = self_attention_output - else: - hidden_states = self_attention_output else: raise ValueError("Invalid layer_type") + hidden_states = self_attention_output if self.layer_scale: if len(hidden_states.shape) == 2: @@ -549,12 +433,8 @@ def forward( self.attn_layer_scale.to(hidden_states.dtype) + 1 ) - if not fusion_active: - # Unfused path: apply post_attention_layernorm in Python - hidden_states, residual = self.post_attention_layernorm( - hidden_states, residual) - # When fusion_active, Phase C already did residual add + RMSNorm - + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) hidden_states = self.mlp(hidden_states) if self.layer_scale: @@ -563,9 +443,7 @@ def forward( self.ffn_layer_scale.to(hidden_states.dtype)[0] + 1 ) else: - assert len(hidden_states.shape) == len( - self.ffn_layer_scale.shape - ), ( + assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), ( f"shape must be the same {len(hidden_states.shape)}, " f"{len(self.ffn_layer_scale.shape)}" ) diff --git a/vllm/nvllm/README.md b/vllm/nvllm/README.md new file mode 100644 index 000000000000..507b357c7b82 --- /dev/null +++ b/vllm/nvllm/README.md @@ -0,0 +1,23 @@ +# nvllm — owned-stack subpackage + +Fork-owned code for the CuTe paged attention + fusion stack. Code here does +NOT subclass upstream model / layer classes: renames in upstream vLLM must +not silently break fusion wiring. + +## Phase B (this subpackage, shipped 2026-04-17) + +- `vllm/nvllm/models/qwen3_5.py` — self-contained Qwen3.5 model with + `Qwen3_5Attention` inlined from the current fusion-patched + `Qwen3NextAttention`. 
`vllm/model_executor/models/qwen3_5.py` is a 1-line + re-export shim so the upstream registry keeps working unchanged. + +## Phase C (next, gated before uber-kernel Phase D+E) + +- `vllm/nvllm/layers/` for RMSNorm, MLP, embedding. Required before fusion + grows to cover MLP / embedding / head. + +## Registry + +Registry loader at `vllm/model_executor/models/registry.py:1283-1284` hardcodes +a `vllm.model_executor.models.` prefix. Rather than modify the +loader, we ship a shim at the old module path that re-exports everything here. diff --git a/vllm/nvllm/__init__.py b/vllm/nvllm/__init__.py new file mode 100644 index 000000000000..bf30f25e2551 --- /dev/null +++ b/vllm/nvllm/__init__.py @@ -0,0 +1,9 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# SPDX-FileCopyrightText: Copyright contributors to the nvllm fork +"""nvllm owned-stack subpackage. + +Everything under `vllm/nvllm/` is fork-owned code. Upstream-renames to +`vllm/model_executor/` should not silently break fusion wiring in here. +See `vllm/nvllm/README.md` for the ownership boundary and roadmap. +""" diff --git a/vllm/nvllm/models/__init__.py b/vllm/nvllm/models/__init__.py new file mode 100644 index 000000000000..208f01a7cb5e --- /dev/null +++ b/vllm/nvllm/models/__init__.py @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project diff --git a/vllm/nvllm/models/qwen3_5.py b/vllm/nvllm/models/qwen3_5.py new file mode 100644 index 000000000000..83237d7f10f6 --- /dev/null +++ b/vllm/nvllm/models/qwen3_5.py @@ -0,0 +1,1201 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + +# Copyright 2025 The vLLM team. +# Copyright 2025 The Qwen Team. +# Copyright 2025 The HuggingFace Inc. team. +# All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Inference-only Qwen3.5 Series compatible with HuggingFace weights.""" + +import typing +from collections.abc import Callable, Iterable +from itertools import islice + +import torch +from torch import nn + +from vllm.compilation.decorators import support_torch_compile +from vllm.config import VllmConfig +from vllm.distributed import ( + get_pp_group, + get_tensor_model_parallel_world_size, +) +from vllm.logger import init_logger +from vllm.model_executor.layers.attention import Attention +from vllm.model_executor.layers.fused_moe import SharedFusedMoE +from vllm.model_executor.layers.layernorm import ( + GemmaRMSNorm as Qwen3_5RMSNorm, +) +from vllm.model_executor.layers.linear import ( + QKVParallelLinear, + RowParallelLinear, +) +from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.mamba.gdn_linear_attn import GatedDeltaNetAttention +from vllm.model_executor.layers.mamba.mamba_utils import ( + MambaStateCopyFunc, + MambaStateCopyFuncCalculator, + MambaStateDtypeCalculator, + MambaStateShapeCalculator, +) +from vllm.model_executor.layers.rotary_embedding import get_rope +from vllm.model_executor.layers.vocab_parallel_embedding import ( + ParallelLMHead, + VocabParallelEmbedding, +) +from vllm.model_executor.model_loader.weight_utils import ( + default_weight_loader, + maybe_remap_kv_scale_name, +) +from vllm.model_executor.models.interfaces import ( + EagleModelMixin, + HasInnerState, + IsHybrid, + MixtureOfExperts, + MultiModalEmbeddings, + SupportsEagle3, + SupportsLoRA, + SupportsPP, + _require_is_multimodal, +) +from vllm.model_executor.models.qwen2_moe import Qwen2MoeMLP as Qwen3NextMLP +from vllm.model_executor.models.qwen3_next import ( + Qwen3NextRMSNorm, # GemmaRMSNorm alias; used for q_norm/k_norm in Qwen3_5Attention + Qwen3NextSparseMoeBlock, + QwenNextMixtureOfExperts, +) +from vllm.model_executor.models.qwen3_vl import ( + Qwen3_VisionTransformer, + Qwen3VLDummyInputsBuilder, + Qwen3VLForConditionalGeneration, + Qwen3VLMultiModalProcessor, + Qwen3VLProcessingInfo, +) +from vllm.model_executor.models.utils import ( + AutoWeightsLoader, + PPMissingLayer, + _merge_multimodal_embeddings, + extract_layer_index, + is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, + make_layers, + maybe_prefix, +) +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.sequence import IntermediateTensors +from vllm.transformers_utils.configs.qwen3_5 import ( + Qwen3_5Config, + Qwen3_5TextConfig, +) +from vllm.transformers_utils.configs.qwen3_5_moe import ( + Qwen3_5MoeConfig, + Qwen3_5MoeTextConfig, +) + +logger = init_logger(__name__) + + +class Qwen3_5ProcessingInfo(Qwen3VLProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen3_5Config) + + +class Qwen3_5MoeProcessingInfo(Qwen3VLProcessingInfo): + def get_hf_config(self): + return self.ctx.get_hf_config(Qwen3_5MoeConfig) + + +class Qwen3_5Attention(nn.Module): + """Qwen3.5 attention block. + + Inlined copy of Qwen3NextAttention as of fusion-ship commit 37cceaa6c, + with the fusion side-channel (`_fusion_active` write, `fusion_active` arg) + removed. Impl owns all fusion state; this class only unconditionally writes + `gate_buf` when `attn_output_gate=True` and leaves the decision to fuse + to `CutePagedAttentionImpl.forward`. 
+ """ + + def __init__( + self, + config, + model_config, + cache_config, + quant_config, + prefix: str = "", + ) -> None: + super().__init__() + self.hidden_size = config.hidden_size + tp_size = get_tensor_model_parallel_world_size() + self.total_num_heads = config.num_attention_heads + assert self.total_num_heads % tp_size == 0 + self.num_heads = self.total_num_heads // tp_size + self.total_num_kv_heads = config.num_key_value_heads + if self.total_num_kv_heads >= tp_size: + assert self.total_num_kv_heads % tp_size == 0 + else: + assert tp_size % self.total_num_kv_heads == 0 + self.num_kv_heads = max(1, self.total_num_kv_heads // tp_size) + self.head_dim = config.head_dim or (self.hidden_size // self.num_heads) + self.q_size = self.num_heads * self.head_dim + self.kv_size = self.num_kv_heads * self.head_dim + self.scaling = self.head_dim**-0.5 + self.dual_chunk_attention_config = getattr( + config, "dual_chunk_attention_config", None + ) + self.attn_output_gate = getattr(config, "attn_output_gate", True) + + self.qkv_proj = QKVParallelLinear( + config.hidden_size, + self.head_dim, + self.total_num_heads * (1 + self.attn_output_gate), + self.total_num_kv_heads, + bias=getattr(config, "qkv_bias", False), + quant_config=quant_config, + prefix=f"{prefix}.qkv_proj", + ) + + self.o_proj = RowParallelLinear( + self.total_num_heads * self.head_dim, + config.hidden_size, + bias=False, + quant_config=quant_config, + prefix=f"{prefix}.o_proj", + ) + + self.rotary_emb = get_rope( + head_size=self.head_dim, + max_position=config.max_position_embeddings, + rope_parameters=config.rope_parameters, + dual_chunk_attention_config=self.dual_chunk_attention_config, + ) + + self.attn = Attention( + self.num_heads, + self.head_dim, + self.scaling, + num_kv_heads=self.num_kv_heads, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.attn", + **{ + "layer_idx": extract_layer_index(prefix), + "dual_chunk_attention_config": self.dual_chunk_attention_config, + } + if self.dual_chunk_attention_config + else {}, + ) + + self.q_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps) + self.k_norm = Qwen3NextRMSNorm(self.head_dim, eps=config.rms_norm_eps) + + def forward( + self, + positions: torch.Tensor, + output: torch.Tensor, + hidden_states: torch.Tensor, + ): + qkv, _ = self.qkv_proj(hidden_states) + + if self.attn_output_gate: + q_gate, k, v = qkv.split( + [self.q_size * 2, self.kv_size, self.kv_size], dim=-1 + ) + orig_shape = q_gate.shape[:-1] + q_gate = q_gate.view(*orig_shape, self.num_heads, -1) + q, gate = torch.chunk(q_gate, 2, dim=-1) + q = q.reshape(*orig_shape, -1) + gate = gate.reshape(*orig_shape, -1) + else: + q, k, v = qkv.split([self.q_size, self.kv_size, self.kv_size], dim=-1) + gate = None + + q = self.q_norm(q.view(-1, self.num_heads, self.head_dim)).view( + -1, self.num_heads * self.head_dim + ) + k = self.k_norm(k.view(-1, self.num_kv_heads, self.head_dim)).view( + -1, self.num_kv_heads * self.head_dim + ) + + q, k = self.rotary_emb(positions, q, k) + + # Unconditionally mirror `gate` into impl's persistent buffer so that + # CUDA-graph replay sees a stable copy and impl.forward() can choose + # to read it when fusion is active. When fusion is disabled the copy + # is a cheap one-off BF16 memcpy; it avoids the old model->impl flag + # side-channel that was flagged as fragile. 
+ if gate is not None: + from vllm.forward_context import get_forward_context + + impl = self.attn.impl + gate_buf = getattr(impl, "gate_buf", None) + if gate_buf is not None: + try: + nat = ( + get_forward_context() + .attn_metadata[self.attn.layer_name] + .num_actual_tokens + ) + gate_buf[:nat].copy_(gate[:nat]) + except (RuntimeError, KeyError, AttributeError, TypeError): + pass + + attn_output = self.attn(q, k, v) + + # Apply gate + o_proj in Python when the kernel did not fuse them. + # `impl._fusion_active` is managed entirely inside impl.forward based + # on the per-forward decode+boundary check, NOT set from this method. + impl = self.attn.impl + if getattr(impl, "_fusion_active", False): + # Kernel wrote wo_output / rmsnorm_output / residual_output. + # DecoderLayer.forward reads those directly; this class leaves + # `output` untouched so the decoder layer can branch on the + # same flag and copy from impl buffers. + return + + if self.attn_output_gate and gate is not None: + gate = torch.sigmoid(gate) + attn_output = attn_output * gate + output[:], _ = self.o_proj(attn_output) + + +class Qwen3_5DecoderLayer(nn.Module): + """Self-contained Qwen3.5 decoder layer. + + No longer subclasses Qwen3NextDecoderLayer. Fusion state lives on impl; + this layer calls `impl.attach_fusion(self)` once in __init__. + """ + + def __init__( + self, + vllm_config: VllmConfig, + layer_type: str, + prefix: str = "", + ) -> None: + super().__init__() + + config = vllm_config.model_config.hf_text_config + model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + quant_config = vllm_config.quant_config + + self.layer_type = layer_type + self.layer_idx = extract_layer_index(prefix) + self.prefix = prefix # needed for MTP opt-out in attach_fusion + + if self.layer_type == "linear_attention": + self.linear_attn = GatedDeltaNetAttention( + config=config, + vllm_config=vllm_config, + prefix=f"{prefix}.linear_attn", + gqa_interleaved_layout=False, + create_in_proj_qkvz=vllm_config.lora_config is None, + ) + elif self.layer_type == "full_attention": + self.self_attn = Qwen3_5Attention( + config, + model_config=model_config, + cache_config=cache_config, + quant_config=quant_config, + prefix=f"{prefix}.self_attn", + ) + else: + raise ValueError(f"Invalid layer_type {self.layer_type}") + + # MLP dispatch on model_type (copied from current child, NOT parent). + if config.model_type == "qwen3_5_moe_text": + self.mlp = Qwen3NextSparseMoeBlock( + vllm_config=vllm_config, + prefix=f"{prefix}.mlp", + ) + elif config.model_type == "qwen3_5_text": + self.mlp = Qwen3NextMLP( + hidden_size=config.hidden_size, + intermediate_size=config.intermediate_size, + hidden_act=config.hidden_act, + quant_config=quant_config, + prefix=f"{prefix}.mlp", + ) + else: + raise ValueError(f"Invalid model_type {config.model_type}") + + self.input_layernorm = Qwen3_5RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + self.post_attention_layernorm = Qwen3_5RMSNorm( + config.hidden_size, eps=config.rms_norm_eps + ) + + self.layer_scale = getattr(config, "layer_scale", False) + if self.layer_scale: + self.attn_layer_scale = torch.nn.Parameter( + torch.zeros(1, 1, config.hidden_size), + ) + self.ffn_layer_scale = torch.nn.Parameter( + torch.zeros(1, 1, config.hidden_size), + ) + + # Declare fusion intent once. Impl owns state; all gating + rebinding + # happens inside CutePagedAttentionImpl.attach_fusion() + + # _resolve_fusion_weights(). 
Pass `self` so impl reads o_proj, + # post_attention_layernorm, sizes, and prefix off the live module. + if self.layer_type == "full_attention": + try: + from vllm.v1.attention.backends.cute_paged._backend import ( + CutePagedAttentionImpl, + ) + + impl = self.self_attn.attn.impl + if isinstance(impl, CutePagedAttentionImpl): + impl.attach_fusion(self) + except (ImportError, AttributeError): + pass + + def forward( + self, + hidden_states: torch.Tensor, + residual: torch.Tensor | None, + positions: torch.Tensor = None, + **kwargs: object, + ): + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + # Impl decides fusion per-forward. We mirror residual into impl's + # persistent buffer unconditionally when fusion could run (full + # attention + CuTe impl) so graph-capture sees stable pointers. + num_tokens = hidden_states.shape[0] + nat = num_tokens + impl = None + if self.layer_type == "full_attention": + impl = self.self_attn.attn.impl + fusion_could_run = getattr(impl, "_fusion_bound", False) + if fusion_could_run: + try: + from vllm.forward_context import get_forward_context + + attn_md = get_forward_context().attn_metadata[ + self.self_attn.attn.layer_name + ] + nat = attn_md.num_actual_tokens + impl.residual_buf[:nat].copy_(residual[:nat]) + except (RuntimeError, KeyError, AttributeError, TypeError): + pass + + self_attention_output = torch.empty_like(hidden_states) + + if self.layer_type == "linear_attention": + self.linear_attn( + hidden_states=hidden_states, + output=self_attention_output, + ) + hidden_states = self_attention_output + elif self.layer_type == "full_attention": + self.self_attn( + hidden_states=hidden_states, + output=self_attention_output, + positions=positions, + ) + if impl is not None and getattr(impl, "_fusion_active", False): + # Kernel already did gate*attn, W_O GEMV, residual+RMSNorm. + self_attention_output[:nat].copy_(impl.rmsnorm_output[:nat]) + if nat < num_tokens: + self_attention_output[nat:].zero_() + residual[:nat].copy_(impl.residual_output[:nat]) + hidden_states = self_attention_output + else: + hidden_states = self_attention_output + else: + raise ValueError("Invalid layer_type") + + if self.layer_scale: + if len(hidden_states.shape) == 2: + hidden_states = hidden_states * ( + self.attn_layer_scale.to(hidden_states.dtype)[0] + 1 + ) + else: + hidden_states = hidden_states * ( + self.attn_layer_scale.to(hidden_states.dtype) + 1 + ) + + if not getattr(impl, "_fusion_active", False): + hidden_states, residual = self.post_attention_layernorm( + hidden_states, residual + ) + + hidden_states = self.mlp(hidden_states) + + if self.layer_scale: + if len(hidden_states.shape) == 2: + hidden_states = hidden_states * ( + self.ffn_layer_scale.to(hidden_states.dtype)[0] + 1 + ) + else: + assert len(hidden_states.shape) == len(self.ffn_layer_scale.shape), ( + f"shape must be the same {len(hidden_states.shape)}, " + f"{len(self.ffn_layer_scale.shape)}" + ) + hidden_states = hidden_states * ( + self.ffn_layer_scale.to(hidden_states.dtype) + 1 + ) + + return hidden_states, residual + + +@support_torch_compile( + dynamic_arg_dims={ + "input_ids": 0, + # positions is of shape (3, seq_len) if mrope is enabled for qwen2-vl, + # otherwise (seq_len, ). 
+ "positions": -1, + "intermediate_tensors": 0, + "inputs_embeds": 0, + } +) +class Qwen3_5Model(nn.Module, EagleModelMixin): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__() + + config: Qwen3_5TextConfig | Qwen3_5MoeTextConfig = ( + vllm_config.model_config.hf_text_config + ) + parallel_config = vllm_config.parallel_config + + eplb_config = parallel_config.eplb_config + self.num_redundant_experts = eplb_config.num_redundant_experts + + self.config = config + self.enable_lora = vllm_config.lora_config is not None + + self.vocab_size = config.vocab_size + + self.embed_tokens = VocabParallelEmbedding( + self.vocab_size, + config.hidden_size, + ) + + def get_layer(prefix: str): + return Qwen3_5DecoderLayer( + vllm_config, + layer_type=config.layer_types[extract_layer_index(prefix)], + prefix=prefix, + ) + + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, get_layer, prefix=f"{prefix}.layers" + ) + self.make_empty_intermediate_tensors = make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size + ) + + if get_pp_group().is_last_rank: + self.norm = Qwen3_5RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + else: + self.norm = PPMissingLayer() + + self.aux_hidden_state_layers: tuple[int, ...] = () + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.embed_tokens(input_ids) + + def forward( + self, + input_ids: torch.Tensor | None, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + ) -> torch.Tensor | IntermediateTensors | tuple[torch.Tensor, list[torch.Tensor]]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.embed_input_ids(input_ids) + residual = None + else: + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + + aux_hidden_states = self._maybe_add_hidden_state([], 0, hidden_states, residual) + for layer_idx, layer in enumerate( + islice(self.layers, self.start_layer, self.end_layer), + start=self.start_layer, + ): + hidden_states, residual = layer( + positions=positions, + hidden_states=hidden_states, + residual=residual, + ) + self._maybe_add_hidden_state( + aux_hidden_states, layer_idx + 1, hidden_states, residual + ) + + if not get_pp_group().is_last_rank: + return IntermediateTensors( + {"hidden_states": hidden_states, "residual": residual} + ) + hidden_states, _ = self.norm(hidden_states, residual) + if aux_hidden_states: + return hidden_states, aux_hidden_states + return hidden_states + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return SharedFusedMoE.make_expert_params_mapping( + self, + ckpt_gate_proj_name="gate_proj", + ckpt_down_proj_name="down_proj", + ckpt_up_proj_name="up_proj", + num_experts=getattr(self.config, "num_experts", 0), + num_redundant_experts=self.num_redundant_experts, + ) + + def load_fused_expert_weights( + self, + name: str, + params_dict: dict, + loaded_weight: torch.Tensor, + shard_id: str, + num_experts: int, + ) -> bool: + param = params_dict[name] + weight_loader = typing.cast(Callable[..., bool], param.weight_loader) + loaded_local_expert = False + for expert_id in range(num_experts): + curr_expert_weight = loaded_weight[expert_id] + success = weight_loader( + param, + curr_expert_weight, + name, + shard_id, + expert_id, + 
return_success=True, + ) + if success: + loaded_local_expert = True + + return loaded_local_expert + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + stacked_params_mapping = [ + # (param_name, shard_name, shard_id) + # self attention + ("qkv_proj", "q_proj", "q"), + ("qkv_proj", "k_proj", "k"), + ("qkv_proj", "v_proj", "v"), + # mlp + ("gate_up_proj", "gate_proj", 0), + ("gate_up_proj", "up_proj", 1), + ("in_proj_ba", "in_proj_b", 0), + ("in_proj_ba", "in_proj_a", 1), + ] + + if self.enable_lora: + stacked_params_mapping.extend( + [ + ("in_proj_qkv", "in_proj_qkv", (0, 1, 2)), + ("in_proj_z", "in_proj_z", 0), + ] + ) + else: + stacked_params_mapping.extend( + [ + ("in_proj_qkvz", "in_proj_qkv", (0, 1, 2)), + ("in_proj_qkvz", "in_proj_z", 3), + ] + ) + + params_dict = dict(self.named_parameters()) + loaded_params: set[str] = set() + expert_params_mapping = self.get_expert_mapping() + is_fused_expert = False + fused_expert_params_mapping = [ + ("experts.w13_weight", "experts.gate_up_proj", 0, "w1"), + ("experts.w2_weight", "experts.down_proj", 0, "w2"), + ] + num_experts = ( + self.config.num_experts if hasattr(self.config, "num_experts") else 0 + ) + for name, loaded_weight in weights: + if "rotary_emb.inv_freq" in name: + continue + + if name.startswith("mtp."): + continue + + # Remapping the name of FP8 kv-scale. + if name.endswith("scale"): + name = maybe_remap_kv_scale_name(name, params_dict) + if name is None: + continue + + for param_name, weight_name, shard_id in stacked_params_mapping: + if "experts.gate_up_proj" in name or "experts.down_proj" in name: + is_fused_expert = True + expert_params_mapping = fused_expert_params_mapping + + if weight_name not in name: + continue + + if "mlp.experts" in name: + continue + + name = name.replace(weight_name, param_name) + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + # Skip layers on other devices. + if is_pp_missing_parameter(name, self): + continue + # name = apply_attn_prefix(name, params_dict) + if name not in params_dict: + continue + param = params_dict[name] + weight_loader = param.weight_loader + if param_name == "in_proj_z" and self.enable_lora: + weight_loader(param, loaded_weight) + else: + weight_loader(param, loaded_weight, shard_id) + break + else: + is_expert_weight = False + for mapping in expert_params_mapping: + param_name, weight_name, expert_id, shard_id = mapping + if weight_name not in name: + continue + is_expert_weight = True + name_mapped = name.replace(weight_name, param_name) + # Skip layers on other devices. + if is_pp_missing_parameter(name_mapped, self): + continue + if is_fused_expert: + # qwen3.5 no need to transpose + # loaded_weight = loaded_weight.transpose(-1, -2) + if "experts.gate_up_proj" in name: + loaded_weight = loaded_weight.chunk(2, dim=-2) + success_w1 = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight[0], + "w1", + num_experts, + ) + success_w3 = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight[1], + "w3", + num_experts, + ) + success = success_w1 and success_w3 + else: + # down_proj + success = self.load_fused_expert_weights( + name_mapped, + params_dict, + loaded_weight, + shard_id, + num_experts, + ) + if success: + name = name_mapped + break + else: + # Skip loading extra bias for GPTQ models. 
+ if ( + name_mapped.endswith(".bias") + or name_mapped.endswith("_bias") + ) and name_mapped not in params_dict: + continue + param = params_dict[name_mapped] + weight_loader = param.weight_loader + success = weight_loader( + param, + loaded_weight, + name_mapped, + shard_id=shard_id, + expert_id=expert_id, + return_success=True, + ) + if success: + name = name_mapped + break + else: + if is_expert_weight: + # We've checked that this is an expert weight + # However it's not mapped locally to this rank + # So we simply skip it + continue + # Skip loading extra bias for GPTQ models. + if name.endswith(".bias") and name not in params_dict: + continue + if is_pp_missing_parameter(name, self): + continue + if name not in params_dict: + logger.warning_once( + f"Parameter {name} not found in params_dict, skip loading" + ) + continue + param = params_dict[name] + weight_loader = getattr( + param, "weight_loader", default_weight_loader + ) + weight_loader(param, loaded_weight) + loaded_params.add(name) + return loaded_params + + +class Qwen3_5ForCausalLMBase( + nn.Module, + HasInnerState, + IsHybrid, + SupportsEagle3, + SupportsLoRA, + SupportsPP, +): + packed_modules_mapping = { + "qkv_proj": [ + "q_proj", + "k_proj", + "v_proj", + ], + "gate_up_proj": ["gate_proj", "up_proj"], + # GDN fused projections. + "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"], + "in_proj_ba": ["in_proj_b", "in_proj_a"], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + config = vllm_config.model_config.hf_text_config + self.vllm_config = vllm_config + self.model_config = vllm_config.model_config + cache_config = vllm_config.cache_config + + scheduler_config = vllm_config.scheduler_config + if cache_config.mamba_cache_mode == "all": + raise NotImplementedError( + "Qwen3.5 currently does not support 'all' prefix caching, " + "please use '--mamba-cache-mode=align' instead" + ) + self.quant_config = vllm_config.quant_config + + super().__init__() + self.config = config + self.scheduler_config = scheduler_config + self.model = Qwen3_5Model( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "model") + ) + + # When LoRA is enabled, GDN uses separate in_proj_qkv and in_proj_z + # instead of merged in_proj_qkvz; pack mapping must match. 
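+        # Illustration (comment only, not executed): with LoRA enabled the
+        # per-instance mapping built below resolves to
+        #     {
+        #         "qkv_proj": ["q_proj", "k_proj", "v_proj"],
+        #         "gate_up_proj": ["gate_proj", "up_proj"],
+        #         "in_proj_ba": ["in_proj_b", "in_proj_a"],
+        #         "in_proj_qkv": ["in_proj_qkv"],
+        #         "in_proj_z": ["in_proj_z"],
+        #     }
+        # i.e. the merged "in_proj_qkvz" entry is dropped and the two GDN
+        # projections are exposed as separate packed modules.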
+ if vllm_config.lora_config: + base = getattr(Qwen3_5ForCausalLMBase, "packed_modules_mapping", {}) + self.packed_modules_mapping = {k: list(v) for k, v in base.items()} + self.packed_modules_mapping.pop("in_proj_qkvz", None) + self.packed_modules_mapping["in_proj_qkv"] = ["in_proj_qkv"] + self.packed_modules_mapping["in_proj_z"] = ["in_proj_z"] + + if get_pp_group().is_last_rank: + if config.tie_word_embeddings: + self.lm_head = self.model.embed_tokens + else: + self.lm_head = ParallelLMHead( + config.vocab_size, + config.hidden_size, + prefix=maybe_prefix(prefix, "lm_head"), + ) + else: + self.lm_head = PPMissingLayer() + + self.logits_processor = LogitsProcessor(config.vocab_size) + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors + ) + + def embed_input_ids(self, input_ids: torch.Tensor) -> torch.Tensor: + return self.model.embed_input_ids(input_ids) + + def set_aux_hidden_state_layers(self, layers: tuple[int, ...]) -> None: + self.model.aux_hidden_state_layers = layers + + def get_eagle3_aux_hidden_state_layers(self) -> tuple[int, ...]: + num_layers = len(self.model.layers) + return (2, num_layers // 2, num_layers - 3) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ): + hidden_states = self.model( + input_ids, positions, intermediate_tensors, inputs_embeds + ) + + return hidden_states + + def compute_logits( + self, + hidden_states: torch.Tensor, + ) -> torch.Tensor | None: + return self.logits_processor(self.lm_head, hidden_states) + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=["mtp."], + ) + return loader.load_weights(weights) + + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.gated_delta_net_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + + @classmethod + def get_mamba_state_shape_from_config( + cls, vllm_config: "VllmConfig" + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_text_config + tp_size = parallel_config.tensor_parallel_size + num_spec = ( + vllm_config.speculative_config.num_speculative_tokens + if vllm_config.speculative_config + else 0 + ) + return MambaStateShapeCalculator.gated_delta_net_state_shape( + tp_size, + hf_config.linear_num_key_heads, + hf_config.linear_num_value_heads, + hf_config.linear_key_head_dim, + hf_config.linear_value_head_dim, + hf_config.linear_conv_kernel_dim, + num_spec, + ) + + @classmethod + def get_mamba_state_copy_func( + cls, + ) -> tuple[MambaStateCopyFunc, MambaStateCopyFunc]: + return MambaStateCopyFuncCalculator.gated_delta_net_state_copy_func() + + +class Qwen3_5ForCausalLM(Qwen3_5ForCausalLMBase): + pass + + +class Qwen3_5MoeForCausalLM(Qwen3_5ForCausalLMBase, QwenNextMixtureOfExperts): + def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""): + super().__init__(vllm_config=vllm_config, prefix=prefix) + + # set MoE hyperparameters + self.set_moe_parameters() + + def get_expert_mapping(self) -> list[tuple[str, str, int, str]]: + return self.model.get_expert_mapping() + + +######################################################## +# 
Qwen3_5-Dense +######################################################## + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen3VLMultiModalProcessor, + info=Qwen3_5ProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder, +) +class Qwen3_5ForConditionalGeneration(Qwen3VLForConditionalGeneration, IsHybrid): + # Qwen3.5 does not support multimodal pruning (EVS). + supports_multimodal_pruning = False + + packed_modules_mapping = Qwen3VLForConditionalGeneration.packed_modules_mapping | { + "in_proj_qkvz": ["in_proj_qkv", "in_proj_z"], + "in_proj_ba": ["in_proj_b", "in_proj_a"], + } + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + # protocols have not __init__ method, so we need to use nn.Module.__init__ + nn.Module.__init__(self) + self.update_packed_mapping(enable_lora=vllm_config.lora_config is not None) + config: Qwen3_5Config = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + # Qwen3.5 does not support multimodal pruning (EVS). + self.is_multimodal_pruning_enabled = False + + with self._mark_tower_model(vllm_config, {"image", "video"}): + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) + + with self._mark_language_model(vllm_config): + self.language_model = Qwen3_5ForCausalLM( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model") + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + + def update_packed_mapping(self, enable_lora: bool): + # When LoRA is enabled, GDN uses separate in_proj_qkv and in_proj_z + if enable_lora: + base = getattr( + Qwen3_5ForConditionalGeneration, "packed_modules_mapping", {} + ) + self.packed_modules_mapping = {k: list(v) for k, v in base.items()} + self.packed_modules_mapping.pop("in_proj_qkvz", None) + self.packed_modules_mapping["in_proj_qkv"] = ["in_proj_qkv"] + self.packed_modules_mapping["in_proj_z"] = ["in_proj_z"] + + def embed_input_ids( + self, + input_ids: torch.Tensor, + multimodal_embeddings: MultiModalEmbeddings | None = None, + *, + is_multimodal: torch.Tensor | None = None, + ) -> torch.Tensor: + inputs_embeds = self._embed_text_input_ids( + input_ids, + self.language_model.embed_input_ids, + is_multimodal=is_multimodal, + ) + + if multimodal_embeddings is None or len(multimodal_embeddings) == 0: + return inputs_embeds + + is_multimodal = _require_is_multimodal(is_multimodal) + + inputs_embeds = _merge_multimodal_embeddings( + inputs_embeds=inputs_embeds, + multimodal_embeddings=multimodal_embeddings, + is_multimodal=is_multimodal, + ) + + return inputs_embeds + + def recompute_mrope_positions(self, *args, **kwargs): + raise NotImplementedError( + "Qwen3.5 does not support multimodal pruning (EVS). " + "recompute_mrope_positions should never be called." + ) + + def forward( + self, + input_ids: torch.Tensor, + positions: torch.Tensor, + intermediate_tensors: IntermediateTensors | None = None, + inputs_embeds: torch.Tensor | None = None, + **kwargs: object, + ) -> torch.Tensor | IntermediateTensors: + """Run forward pass for Qwen3.5. + + Args: + input_ids: Flattened (concatenated) input_ids corresponding to a + batch. 
+ positions: Flattened (concatenated) position ids corresponding to a + batch. + **NOTE**: If mrope is enabled (default setting for Qwen3VL + opensource models), the shape will be `(3, seq_len)`, + otherwise it will be `(seq_len,). + intermediate_tensors: Intermediate tensors from previous pipeline + stages. + inputs_embeds: Pre-computed input embeddings. + **kwargs: Additional keyword arguments including: + - pixel_values: Pixel values to be fed to a model. + `None` if no images are passed. + - image_grid_thw: Tensor `(n_images, 3)` of image 3D grid in + LLM. `None` if no images are passed. + - pixel_values_videos: Pixel values of videos to be fed to a + model. `None` if no videos are passed. + - video_grid_thw: Tensor `(n_videos, 3)` of video 3D grid in + LLM. `None` if no videos are passed. + """ + + if intermediate_tensors is not None: + inputs_embeds = None + + hidden_states = self.language_model.model( + input_ids=input_ids, + positions=positions, + intermediate_tensors=intermediate_tensors, + inputs_embeds=inputs_embeds, + ) + + return hidden_states + + def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + loader = AutoWeightsLoader( + self, + skip_prefixes=["mtp."], + ) + return loader.load_weights(weights, mapper=self.hf_to_vllm_mapper) + + @classmethod + def get_mamba_state_dtype_from_config( + cls, + vllm_config: "VllmConfig", + ) -> tuple[torch.dtype, torch.dtype]: + return MambaStateDtypeCalculator.gated_delta_net_state_dtype( + vllm_config.model_config.dtype, + vllm_config.cache_config.mamba_cache_dtype, + vllm_config.cache_config.mamba_ssm_cache_dtype, + ) + + @classmethod + def get_mamba_state_shape_from_config( + cls, vllm_config: "VllmConfig" + ) -> tuple[tuple[int, int], tuple[int, int]]: + parallel_config = vllm_config.parallel_config + hf_config = vllm_config.model_config.hf_text_config + tp_size = parallel_config.tensor_parallel_size + num_spec = ( + vllm_config.speculative_config.num_speculative_tokens + if vllm_config.speculative_config + else 0 + ) + return MambaStateShapeCalculator.gated_delta_net_state_shape( + tp_size, + hf_config.linear_num_key_heads, + hf_config.linear_num_value_heads, + hf_config.linear_key_head_dim, + hf_config.linear_value_head_dim, + hf_config.linear_conv_kernel_dim, + num_spec, + ) + + @classmethod + def get_mamba_state_copy_func(cls) -> tuple[MambaStateCopyFunc, MambaStateCopyFunc]: + return MambaStateCopyFuncCalculator.gated_delta_net_state_copy_func() + + +######################################################## +# Qwen3_5-MoE +######################################################## + + +class Qwen3_5_MoeMixtureOfExperts(MixtureOfExperts): + def update_physical_experts_metadata( + self, + num_physical_experts: int, + num_local_physical_experts: int, + ) -> None: + assert self.num_local_physical_experts == num_local_physical_experts + self.num_physical_experts = num_physical_experts + self.num_local_physical_experts = num_local_physical_experts + self.num_redundant_experts = num_physical_experts - self.num_logical_experts + for layer in self.language_model.model.layers: + if isinstance(layer.mlp, Qwen3NextSparseMoeBlock): + moe = layer.mlp + moe.n_local_physical_experts = num_local_physical_experts + moe.n_physical_experts = num_physical_experts + moe.n_redundant_experts = self.num_redundant_experts + moe.experts.update_expert_map() + + def set_moe_parameters(self): + self.expert_weights = [] + + self.moe_layers = [] + example_moe = None + for layer in self.language_model.model.layers: + if 
isinstance(layer, Qwen3_5DecoderLayer) and isinstance( + layer.mlp, Qwen3NextSparseMoeBlock + ): + example_moe = layer.mlp + self.moe_layers.append(layer.mlp.experts) + + if example_moe is None: + raise RuntimeError( + "No Qwen3_5 layer found in the language_model.model.layers." + ) + + # Set MoE hyperparameters + self.num_moe_layers = len(self.moe_layers) + self.num_expert_groups = 1 + self.num_shared_experts = 0 + self.num_logical_experts = example_moe.n_logical_experts + self.num_physical_experts = example_moe.n_physical_experts + self.num_local_physical_experts = example_moe.n_local_physical_experts + self.num_routed_experts = example_moe.n_routed_experts + self.num_redundant_experts = example_moe.n_redundant_experts + + +@MULTIMODAL_REGISTRY.register_processor( + Qwen3VLMultiModalProcessor, + info=Qwen3_5MoeProcessingInfo, + dummy_inputs=Qwen3VLDummyInputsBuilder, +) +class Qwen3_5MoeForConditionalGeneration( + Qwen3_5ForConditionalGeneration, Qwen3_5_MoeMixtureOfExperts +): + # For MoE LoRA weights loading + is_3d_moe_weight: bool = True + + def __init__(self, *, vllm_config: VllmConfig, prefix: str = "model"): + # protocols have not __init__ method, so we need to use nn.Module.__init__ + nn.Module.__init__(self) + self.update_packed_mapping(enable_lora=vllm_config.lora_config is not None) + config: Qwen3_5MoeConfig = vllm_config.model_config.hf_config + quant_config = vllm_config.quant_config + multimodal_config = vllm_config.model_config.multimodal_config + + self.config = config + self.multimodal_config = multimodal_config + self.use_data_parallel = multimodal_config.mm_encoder_tp_mode == "data" + # Qwen3.5 does not support multimodal pruning (EVS). + self.is_multimodal_pruning_enabled = False + + with self._mark_tower_model(vllm_config, {"image", "video"}): + self.visual = Qwen3_VisionTransformer( + config.vision_config, + norm_eps=getattr(config, "rms_norm_eps", 1e-6), + quant_config=quant_config, + prefix=maybe_prefix(prefix, "visual"), + ) + + with self._mark_language_model(vllm_config): + self.language_model = Qwen3_5MoeForCausalLM( + vllm_config=vllm_config, prefix=maybe_prefix(prefix, "language_model") + ) + + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors + ) + + # set MoE hyperparameters + self.set_moe_parameters() diff --git a/vllm/v1/attention/backends/cute_paged/_backend.py b/vllm/v1/attention/backends/cute_paged/_backend.py index 0903a9756015..7d4b29327ab7 100644 --- a/vllm/v1/attention/backends/cute_paged/_backend.py +++ b/vllm/v1/attention/backends/cute_paged/_backend.py @@ -1,5 +1,6 @@ # Copyright 2026 Navi Ai Labs # SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project """CuTe DSL paged attention backend classes for SM120/SM121 (GB10). Custom attention kernel using CuTe Python DSL with FP8 MMA for QK, @@ -8,8 +9,10 @@ See: docs/superpowers/specs/2026-04-10-cute-paged-attention-design.md """ + from __future__ import annotations +import os from dataclasses import dataclass from typing import TYPE_CHECKING, ClassVar @@ -34,11 +37,15 @@ logger = init_logger(__name__) +# Set CUTE_DEBUG_FUSION=1 to enable per-call diff vs Python-dequant W_O ref. 
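# Editorial note (not part of the patch): the flag below is read once at
# module import, so it must be set in the EngineCore process environment
# before this backend loads. The diagnostics it gates call .item() /
# .abs().max() per decode step (device->host syncs), so leave it unset for
# performance or latency measurements.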
+_DEBUG_FUSION = os.environ.get("CUTE_DEBUG_FUSION", "0") == "1" + # --------------------------------------------------------------------------- # Metadata # --------------------------------------------------------------------------- + @dataclass class CutePagedMetadata(AttentionMetadata): """Per-batch metadata for CuTe paged attention.""" @@ -53,13 +60,13 @@ class CutePagedMetadata(AttentionMetadata): num_prefill_tokens: int # Sequence info - seq_lens: torch.Tensor # [num_seqs] int32 on device - query_start_loc: torch.Tensor # [num_seqs + 1] int32 on device + seq_lens: torch.Tensor # [num_seqs] int32 on device + query_start_loc: torch.Tensor # [num_seqs + 1] int32 on device max_query_len: int max_seq_len: int # Page table - block_table: torch.Tensor # [num_seqs, max_blocks_per_seq] int32 + block_table: torch.Tensor # [num_seqs, max_blocks_per_seq] int32 # Flags is_decode_only: bool @@ -69,6 +76,7 @@ class CutePagedMetadata(AttentionMetadata): # Backend # --------------------------------------------------------------------------- + class CutePagedBackend(AttentionBackend): """CuTe DSL paged attention backend for SM120/SM121.""" @@ -77,7 +85,8 @@ class CutePagedBackend(AttentionBackend): supported_dtypes: ClassVar[list[torch.dtype]] = [torch.bfloat16] supported_kv_cache_dtypes: ClassVar[list[CacheDType]] = [ - "fp8", "fp8_e4m3", + "fp8", + "fp8_e4m3", ] @staticmethod @@ -98,13 +107,15 @@ def supports_head_size(cls, head_size: int) -> bool: @classmethod def supports_compute_capability( - cls, capability: DeviceCapability, + cls, + capability: DeviceCapability, ) -> bool: return capability.major == 12 @classmethod def supports_kv_cache_dtype( - cls, kv_cache_dtype: CacheDType | None, + cls, + kv_cache_dtype: CacheDType | None, ) -> bool: return kv_cache_dtype in ("fp8", "fp8_e4m3") @@ -146,6 +157,7 @@ def get_kv_cache_stride_order( # Attention Implementation # --------------------------------------------------------------------------- + class CutePagedAttentionImpl(AttentionImpl[CutePagedMetadata]): """CuTe DSL paged attention forward pass.""" @@ -165,13 +177,9 @@ def __init__( kv_sharing_target_layer_name: str | None = None, ) -> None: if sliding_window is not None: - raise ValueError( - "CutePagedAttention does not support sliding window" - ) + raise ValueError("CutePagedAttention does not support sliding window") if logits_soft_cap is not None: - raise ValueError( - "CutePagedAttention does not support logits_soft_cap" - ) + raise ValueError("CutePagedAttention does not support logits_soft_cap") if attn_type != AttentionType.DECODER: raise ValueError( f"CutePagedAttention only supports DECODER, got {attn_type}" @@ -191,26 +199,19 @@ def __init__( logger.info( "CutePagedAttention initialized: %d Q heads, %d KV heads, " "head_dim=%d, GQA ratio=%d", - self.num_heads, self.num_kv_heads, - self.head_size, self.num_queries_per_kv, + self.num_heads, + self.num_kv_heads, + self.head_size, + self.num_queries_per_kv, ) + # Fusion state is owned exclusively by this impl (spec § Impl side). + # Buffers are allocated later in attach_fusion() with sizes passed + # by the model, NOT read from get_current_vllm_config() — avoids the + # hf_config vs hf_text_config fragility (code-review I1). self._fusion_bound = False self._fusion_active = False - - # Pre-allocate fusion buffers during init so they don't - # interfere with vLLM V1's memory pool during forward. - # Uses vllm_config to get max_num_seqs and hidden_dim. 
- try: - from vllm.config import get_current_vllm_config - cfg = get_current_vllm_config() - max_num_seqs = cfg.scheduler_config.max_num_seqs - hidden_dim = cfg.model_config.hf_config.hidden_size - q_size = self.num_heads * self.head_size - self._preallocate_fusion_buffers( - max_num_seqs, hidden_dim, q_size, "cuda") - except Exception: - pass # Will allocate lazily in bind_fusion_weights + self._fusion_attached = False # set by attach_fusion def _preallocate_fusion_buffers( self, @@ -225,68 +226,169 @@ def _preallocate_fusion_buffers( interfere with vLLM V1's pre-allocated memory pool. """ self.wo_output = torch.zeros( - max_num_seqs, hidden_dim, dtype=torch.float32, device=device) + max_num_seqs, hidden_dim, dtype=torch.float32, device=device + ) self.rmsnorm_output = torch.empty( - max_num_seqs, hidden_dim, dtype=torch.bfloat16, device=device) + max_num_seqs, hidden_dim, dtype=torch.bfloat16, device=device + ) self.residual_output = torch.empty( - max_num_seqs, hidden_dim, dtype=torch.bfloat16, device=device) - self.arrival_count = torch.zeros( - max_num_seqs, dtype=torch.int32, device=device) + max_num_seqs, hidden_dim, dtype=torch.bfloat16, device=device + ) + self.arrival_count = torch.zeros(max_num_seqs, dtype=torch.int32, device=device) self.gate_buf = torch.empty( - max_num_seqs, q_size, dtype=torch.bfloat16, device=device) + max_num_seqs, q_size, dtype=torch.bfloat16, device=device + ) self.residual_buf = torch.empty( - max_num_seqs, hidden_dim, dtype=torch.bfloat16, device=device) + max_num_seqs, hidden_dim, dtype=torch.bfloat16, device=device + ) - def bind_fusion_weights( - self, - wo_weight: torch.Tensor, - wo_scales: torch.Tensor, - wo_global_scale: torch.Tensor, - rmsnorm_gamma: torch.Tensor, - rmsnorm_eps: float, - max_num_seqs: int, - ) -> None: - """Bind static fusion weights and allocate persistent I/O buffers. - - Called once from the model layer after weight loading. Replaces - the per-forward side-channel set/clear pattern. All buffer - addresses are stable — safe for CUDA graph capture and replay. - - Args: - wo_weight: NVFP4 packed weights [N, K/2] uint8 - wo_scales: Per-block scales [N, K_sf] fp8 - wo_global_scale: Scalar scale [1] fp32 (kernel reads via ld.global) - rmsnorm_gamma: LayerNorm weight [hidden_dim] bf16 - rmsnorm_eps: LayerNorm epsilon (e.g. 1e-6) - max_num_seqs: Maximum batch size for buffer allocation + def attach_fusion(self, parent_layer: torch.nn.Module) -> None: + """Declare fusion intent. Called once per layer from the model + `__init__` (see `vllm/nvllm/models/qwen3_5.py:Qwen3_5DecoderLayer`). + + Stores MODULE refs (not tensor refs) to o_proj and + post_attention_layernorm — NVFP4's `process_weights_after_loading` + REPLACES `weight_global_scale` with a new Parameter, so any tensor + captured here would go stale (code-review C1). + + Pre-allocates persistent fusion buffers synchronously from sizes + read off parent_layer. This replaces the old + `get_current_vllm_config()` fallback that could silently defer + allocation past CUDA-graph capture (code-review I1). 
""" - # Static weights (bound once, never change) - self.wo_weight = wo_weight - self.wo_scales = wo_scales - self.wo_global_scale = wo_global_scale - self.rmsnorm_gamma = rmsnorm_gamma - self.rmsnorm_eps = rmsnorm_eps - - hidden_dim = rmsnorm_gamma.shape[0] - q_size = self.num_heads * self.head_size # num_heads * head_dim - - # Persistent I/O buffers are pre-allocated during __init__ via - # _preallocate_fusion_buffers() so they don't interfere with - # vLLM V1's memory pool during the first forward pass. - # If not yet allocated (e.g. __init__ didn't have config), do it now. - if not hasattr(self, 'wo_output'): - self._preallocate_fusion_buffers( - max_num_seqs, hidden_dim, q_size, wo_weight.device) + # MTP opt-out (spec "MTP handling"; code-review G3). MTP draft + # layers run with different batch shapes, and the fused kernel's + # layout assumptions aren't verified for the spec-decode path. + prefix = getattr(parent_layer, "prefix", "") + if "mtp" in prefix: + logger.debug("CuTe fusion: skipping MTP layer %s", prefix or "") + return - self._fusion_bound = True + # Resolve sizes explicitly — no reliance on hf_config attr name. + self_attn = parent_layer.self_attn + q_size = self_attn.num_heads * self_attn.head_dim + hidden_dim = self_attn.hidden_size + + try: + from vllm.config import get_current_vllm_config + + cfg = get_current_vllm_config() + max_num_seqs = cfg.scheduler_config.max_num_seqs + except Exception as e: + logger.error( + "CuTe fusion: attach_fusion cannot resolve max_num_seqs; " + "fusion disabled for layer %s. Error: %s", + prefix, + e, + ) + return + + # Store module refs, NOT tensor refs. + self._o_proj_module = self_attn.o_proj + self._post_norm_module = parent_layer.post_attention_layernorm + self._attn_output_gate = bool(self_attn.attn_output_gate) + self._fusion_prefix = prefix + self._fusion_max_num_seqs = max_num_seqs + self._fusion_hidden_dim = hidden_dim + self._fusion_q_size = q_size + + # Allocate buffers ONCE. Subsequent attach calls (should not happen + # under single-instantiation, but defensive) are no-ops for + # buffer allocation so CUDA-graph pointers stay stable (H3). + if not hasattr(self, "wo_output"): + self._preallocate_fusion_buffers(max_num_seqs, hidden_dim, q_size, "cuda") + + self._fusion_attached = True + logger.info( + "CuTe fusion attached: layer=%s max_num_seqs=%d hidden_dim=%d " + "q_size=%d attn_output_gate=%s", + prefix, + max_num_seqs, + hidden_dim, + q_size, + self._attn_output_gate, + ) + + def _resolve_fusion_weights(self) -> None: + """Bind current NVFP4 weight tensors off the stored o_proj / post_norm + module refs. Called from `process_weights_after_loading` on EVERY + invocation — supports live weight reload at + `vllm/model_executor/model_loader/reload/layerwise.py:215-284` + (code-review C2). + + No short-circuit on `_fusion_bound=True`. Overwrites strong refs so + the next forward reads the NEW Parameter identity NVFP4 installed. + """ + if not getattr(self, "_fusion_attached", False): + # attach_fusion() was never called (MTP, BF16, non-full-attention, + # or attach_fusion hit an early return). + return + + o_proj = self._o_proj_module + post_norm = self._post_norm_module + + # The "is this NVFP4?" gate — matches current behavior at + # `vllm/model_executor/models/qwen3_next.py:484` (code-review H2). + # A BF16 / FP8 serve lacks weight_global_scale — skip silently. 
+ if not hasattr(o_proj, "weight_global_scale"): + logger.warning( + "CuTe fusion: o_proj weights not NVFP4 (or not loaded) for " + "layer %s; fusion disabled this call.", + self._fusion_prefix, + ) + self._fusion_bound = False + return + # Read tensor refs FRESH every call (code-review C1, C2). + self.wo_weight = o_proj.weight + self.wo_scales = o_proj.weight_scale + self.wo_global_scale = o_proj.weight_global_scale + self.rmsnorm_gamma = post_norm.weight + self.rmsnorm_eps = post_norm.variance_epsilon + + self._fusion_bound = True logger.info( - "CuTe fusion bound: hidden_dim=%d, q_size=%d, max_seqs=%d, " - "wo_weight=%s, rmsnorm_gamma=%s", - hidden_dim, q_size, max_num_seqs, - list(wo_weight.shape), list(rmsnorm_gamma.shape), + "CuTe fusion resolved: layer=%s wo_weight=%s rmsnorm_gamma=%s", + self._fusion_prefix, + list(self.wo_weight.shape), + list(self.rmsnorm_gamma.shape), ) + # --- DISABLED 2026-04-17 (Phase B own-the-stack refactor) --- + # Replaced by `attach_fusion(parent_layer)` + `_resolve_fusion_weights()`. + # Kept commented (not deleted) until Tier-3 GSM8K 8/8 validates the new + # path. Remove in a follow-up commit once the refactor is proven. + # --- DISABLED block start --- + # def bind_fusion_weights( + # self, + # wo_weight: torch.Tensor, + # wo_scales: torch.Tensor, + # wo_global_scale: torch.Tensor, + # rmsnorm_gamma: torch.Tensor, + # rmsnorm_eps: float, + # max_num_seqs: int, + # ) -> None: + # """Bind static fusion weights and allocate persistent I/O buffers.""" + # self.wo_weight = wo_weight + # self.wo_scales = wo_scales + # self.wo_global_scale = wo_global_scale + # self.rmsnorm_gamma = rmsnorm_gamma + # self.rmsnorm_eps = rmsnorm_eps + # hidden_dim = rmsnorm_gamma.shape[0] + # q_size = self.num_heads * self.head_size + # if not hasattr(self, "wo_output"): + # self._preallocate_fusion_buffers( + # max_num_seqs, hidden_dim, q_size, wo_weight.device + # ) + # self._fusion_bound = True + # logger.info( + # "CuTe fusion bound: hidden_dim=%d, q_size=%d, max_seqs=%d, " + # "wo_weight=%s, rmsnorm_gamma=%s", + # hidden_dim, q_size, max_num_seqs, + # list(wo_weight.shape), list(rmsnorm_gamma.shape), + # ) + # --- DISABLED block end --- + def forward( self, layer: torch.nn.Module, @@ -307,10 +409,24 @@ def forward( k_scale = getattr(layer, "_k_scale_float", 1.0) v_scale = getattr(layer, "_v_scale_float", 1.0) - # Fusion requires both: weights bound AND model layer opted in. - # _fusion_bound = weights/buffers allocated (set once at init). - # _fusion_active = model layer says "this forward is fused" (per-call). - use_fusion = self._fusion_bound and self._fusion_active + # Per-forward gating lives entirely inside impl (spec § Per-forward + # gating). Fusion activates only for decode batches whose + # num_actual_tokens fits the pre-allocated buffers — prevents + # out-of-range writes if an unusually large decode batch arrives + # (code-review A3). 
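# Worked example (editorial; numbers are illustrative, not from this run):
# if attach_fusion() sized the buffers for 4 sequences and a decode batch
# arrives with num_actual_tokens=6, Phase B would need wo_output rows 0..5
# but only 0..3 exist — the gate below keeps such a call on the unfused
# path instead of letting the kernel write past the buffer.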
+ num_actual_tokens = attn_metadata.num_actual_tokens + is_decode_only = getattr(attn_metadata, "is_decode_only", False) + fits_buffer = num_actual_tokens <= getattr(self, "_fusion_max_num_seqs", 0) + self._fusion_active = self._fusion_bound and is_decode_only and fits_buffer + use_fusion = self._fusion_active + if _DEBUG_FUSION: + logger.info( + "[CUTE_DEBUG_FUSION] layer=%s bound=%s active=%s use_fusion=%s", + getattr(layer, "layer_name", ""), + self._fusion_bound, + self._fusion_active, + use_fusion, + ) wo_weight = self.wo_weight if use_fusion else None wo_scales = self.wo_scales if use_fusion else None wo_global_scale = self.wo_global_scale if use_fusion else None @@ -335,8 +451,6 @@ def forward( paged_attention_forward, ) - num_actual_tokens = attn_metadata.num_actual_tokens - # For graph-safe dispatch: padded batch size for grid.z num_seqs = len(attn_metadata.seq_lens) padded_num_seqs = num_seqs # graph capture overrides via metadata @@ -365,9 +479,148 @@ def forward( padded_num_seqs=padded_num_seqs, ) + # --- DEBUG: fusion diagnostic (CUTE_DEBUG_FUSION=1) --- + # Compares kernel's impl.wo_output (Phase B GEMV) against a Python + # reference computed from the kernel's own Phase A output (`result`) + # and a one-time-dequantized W_O. Proves whether Phase B is faithful. + if _DEBUG_FUSION and use_fusion: + self._debug_fusion_diff( + result=result, + num_actual_tokens=num_actual_tokens, + layer_name=getattr(layer, "layer_name", ""), + ) + # --- END DEBUG --- + output[:num_actual_tokens].copy_(result) return output + def _debug_fusion_diff( + self, + result: torch.Tensor, + num_actual_tokens: int, + layer_name: str, + ) -> None: + """One-shot per-call diagnostic: compare kernel wo_output to ref.""" + # Dequant W_O lazily on first call, then cache on self. + if not hasattr(self, "_wo_dq_cached"): + W = self.wo_weight # [N, K/2] uint8 NVFP4 packed + S_sw = self.wo_scales # [N, K_sf] fp8_e4m3fn (swizzled!) + GS = self.wo_global_scale.item() + + # Invert the CUTLASS swizzle to recover logical [N, K/16] scales. + # Our swizzle layout is [M/128, K/4, 32, 4, 4]; inverse permute (0,4,3,1,2). + N, K_half = W.shape + K = K_half * 2 + num_k_groups = K // 16 + num_m_tiles = (N + 127) // 128 + num_k_tiles = (num_k_groups + 3) // 4 + if ( + S_sw.shape[0] == N + and S_sw.shape[1] == num_k_groups + and num_m_tiles * 128 == N + and num_k_tiles * 4 == num_k_groups + ): + # Swizzled 5D layout: (m_tile, k_tile, m_inner=32, m_mid=4, k_inner=4). + # Recover (m_tile, m_mid, m_inner, k_tile, k_inner) so reshape + # yields M = m_tile*128 + m_mid*32 + m_inner in C order. + S_sw_view = S_sw.view(num_m_tiles, num_k_tiles, 32, 4, 4) + S_unswizzled = S_sw_view.permute(0, 3, 2, 1, 4).contiguous() + S_unswizzled = S_unswizzled.view(N, num_k_groups).to(torch.float32) + else: + # Fall back: treat as logical already (diagnostic best-effort). 
+ S_unswizzled = S_sw.to(torch.float32).view(N, num_k_groups) + + # FP4 E2M1 LUT (matches kernel _fp4_nibble_to_f32) + lut = torch.tensor( + [ + 0.0, + 0.5, + 1.0, + 1.5, + 2.0, + 3.0, + 4.0, + 6.0, + -0.0, + -0.5, + -1.0, + -1.5, + -2.0, + -3.0, + -4.0, + -6.0, + ], + dtype=torch.float32, + device=W.device, + ) + low_nib = (W & 0x0F).to(torch.int64) + high_nib = ((W >> 4) & 0x0F).to(torch.int64) + nib = torch.empty(N, K, dtype=torch.int64, device=W.device) + nib[:, 0::2] = low_nib + nib[:, 1::2] = high_nib + W_fp = lut[nib] + sf_expanded = S_unswizzled.repeat_interleave(16, dim=1) + self._wo_dq_cached = (W_fp * sf_expanded * GS).contiguous() + logger.info( + "[CUTE_DEBUG_FUSION] layer=%s cached W_O dq: shape=%s absmax=%.4f", + layer_name, + list(self._wo_dq_cached.shape), + self._wo_dq_cached.abs().max().item(), + ) + + W_dq = self._wo_dq_cached # [N, K] + nat = int(num_actual_tokens) + attn = result[:nat].reshape(nat, -1).float() # [nat, K] + ref = attn @ W_dq.T # [nat, N] + + kernel_out = self.wo_output[:nat].float() + diff = (kernel_out - ref).abs() + logger.info( + "[CUTE_DEBUG_FUSION] layer=%s nat=%d phaseB " + "ref: absmax=%.4f mean=%.4e " + "kernel: absmax=%.4f mean=%.4e " + "diff: max=%.4f mean=%.4e close=%s", + layer_name, + nat, + ref.abs().max().item(), + ref.mean().item(), + kernel_out.abs().max().item(), + kernel_out.mean().item(), + diff.max().item(), + diff.mean().item(), + bool(torch.allclose(kernel_out, ref, rtol=1e-2, atol=1e-2)), + ) + + # --- Phase C reference: residual add + RMSNorm --- + residual_in = self.residual_buf[:nat].float() # BF16 → F32 + new_residual_ref = residual_in + kernel_out # f32 + gamma = self.rmsnorm_gamma.float() + eps = float(self.rmsnorm_eps) + var = new_residual_ref.pow(2).mean(dim=-1, keepdim=True) + inv_rms = torch.rsqrt(var + eps) + hidden_ref = new_residual_ref * inv_rms * gamma # f32 + + hidden_kernel = self.rmsnorm_output[:nat].float() + res_kernel = self.residual_output[:nat].float() + h_diff = (hidden_kernel - hidden_ref).abs() + r_diff = (res_kernel - new_residual_ref).abs() + logger.info( + "[CUTE_DEBUG_FUSION] layer=%s nat=%d phaseC " + "hidden_ref_absmax=%.4f hidden_kernel_absmax=%.4f h_max_diff=%.4f " + "res_ref_absmax=%.4f res_kernel_absmax=%.4f r_max_diff=%.4f " + "close_h=%s close_r=%s", + layer_name, + nat, + hidden_ref.abs().max().item(), + hidden_kernel.abs().max().item(), + h_diff.max().item(), + new_residual_ref.abs().max().item(), + res_kernel.abs().max().item(), + r_diff.max().item(), + bool(torch.allclose(hidden_kernel, hidden_ref, rtol=2e-2, atol=2e-2)), + bool(torch.allclose(res_kernel, new_residual_ref, rtol=2e-2, atol=2e-2)), + ) + def do_kv_cache_update( self, layer: torch.nn.Module, @@ -391,13 +644,21 @@ def do_kv_cache_update( ) def process_weights_after_loading(self, act_dtype: torch.dtype) -> None: - pass + """Invoked by vLLM's weight loader for each Attention module AFTER + all quant methods have processed weights (swizzle, pad, invert GS). + This is the last safe opportunity to bind fusion weights before + torch.compile traces the forward pass — and it fires a SECOND time + on live weight reload (see `layerwise.py:215-284`), so re-resolving + on every call is a correctness requirement (code-review C2). 
+ """ + self._resolve_fusion_weights() # --------------------------------------------------------------------------- # Metadata Builder # --------------------------------------------------------------------------- + class CutePagedMetadataBuilder( AttentionMetadataBuilder[CutePagedMetadata], ): @@ -418,7 +679,8 @@ def __init__( self.block_size = kv_cache_spec.block_size logger.info( "CutePagedMetadataBuilder: block_size=%d, layers=%d", - self.block_size, len(layer_names), + self.block_size, + len(layer_names), ) def build( @@ -439,9 +701,7 @@ def build( # Count prefill vs decode requests # Decode: query_len == 1, Prefill: query_len > 1 - query_lens_cpu = ( - query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] - ) + query_lens_cpu = query_start_loc_cpu[1:] - query_start_loc_cpu[:-1] num_decodes = int((query_lens_cpu == 1).sum().item()) num_prefills = num_reqs - num_decodes num_decode_tokens = num_decodes @@ -463,7 +723,8 @@ def build( ) def build_for_cudagraph_capture( - self, common_attn_metadata: CommonAttentionMetadata, + self, + common_attn_metadata: CommonAttentionMetadata, ) -> CutePagedMetadata: """Override for CUDA graph capture. diff --git a/vllm/v1/attention/backends/cute_paged/kernel.py b/vllm/v1/attention/backends/cute_paged/kernel.py index 9c3342290b00..0b109866dc03 100644 --- a/vllm/v1/attention/backends/cute_paged/kernel.py +++ b/vllm/v1/attention/backends/cute_paged/kernel.py @@ -1709,13 +1709,25 @@ def _kernel(self, query, k_ptr: Int64, v_ptr: Int64, # globally visible before any CTA reads them _threadfence() - # Arrival counter: atomicAdd 1, last CTA runs Phase C - old_count = _atomic_add_u32( - arrival_count_ptr + Int64(seq_idx * Int32(4)), - Int32(1)) + # Arrival counter: only thread 0 of each CTA bumps it, + # then broadcasts "am I in the last-arriving CTA" via + # SMEM to all 128 threads. Without this, every thread + # atomicAdds (512 bumps per call instead of 4) and only + # ONE thread matches old==total-1 → partial Phase C. + if tid == Int32(0): + old_count = _atomic_add_u32( + arrival_count_ptr + Int64(seq_idx * Int32(4)), + Int32(1)) + if old_count == total_ctas_per_seq - Int32(1): + _st_shared_f32(sync_md, Float32(1.0)) + else: + _st_shared_f32(sync_md, Float32(0.0)) + cute.arch.sync_threads() + + is_last_cta = _ld_shared_f32(sync_md) - if old_count == total_ctas_per_seq - Int32(1): - # I am the last CTA — all Phase B writes are complete. + if is_last_cta > Float32(0.5): + # I am in the last CTA — all Phase B writes are complete. 
# Derive tiling from hidden_dim parameter (NOT hardcoded) hd_c = hidden_dim diff --git a/vllm/v1/attention/backends/fa_utils.py b/vllm/v1/attention/backends/fa_utils.py index a4423b301d69..97fc35e70c67 100644 --- a/vllm/v1/attention/backends/fa_utils.py +++ b/vllm/v1/attention/backends/fa_utils.py @@ -154,6 +154,17 @@ def get_flash_attn_version( return None +def is_fa_version_supported(fa_version: int) -> bool: + try: + from vllm.vllm_flash_attn.flash_attn_interface import ( + is_fa_version_supported as _is_fa_version_supported, + ) + + return _is_fa_version_supported(fa_version) + except ImportError: + return False + + def flash_attn_supports_fp8() -> bool: return ( get_flash_attn_version() == 3 diff --git a/vllm/v1/attention/backends/flash_attn.py b/vllm/v1/attention/backends/flash_attn.py index d72c2aeb6161..203a4c339bac 100755 --- a/vllm/v1/attention/backends/flash_attn.py +++ b/vllm/v1/attention/backends/flash_attn.py @@ -10,6 +10,7 @@ import torch from vllm.model_executor.layers.attention import Attention +from vllm.platforms import current_platform from vllm.utils.torch_utils import is_quantized_kv_cache from vllm.v1.attention.backend import ( AttentionBackend, @@ -20,6 +21,7 @@ from vllm.v1.attention.backends.fa_utils import ( flash_attn_supports_fp8, get_flash_attn_version, + is_fa_version_supported, is_flash_attn_varlen_func_available, ) from vllm.v1.attention.backends.utils import get_dcp_local_seq_lens @@ -45,7 +47,6 @@ from vllm.config.cache import CacheDType from vllm.distributed.parallel_state import get_dcp_group from vllm.logger import init_logger -from vllm.platforms import current_platform from vllm.platforms.interface import DeviceCapability from vllm.utils.math_utils import cdiv, round_up from vllm.v1.attention.backend import ( @@ -171,7 +172,13 @@ def get_fp8_dtype_for_flashattn(kv_cache_dtype: str) -> torch.dtype: @classmethod def supports_head_size(cls, head_size: int) -> bool: - return head_size % 8 == 0 and head_size <= 256 + if head_size % 8 != 0: + return False + if head_size <= 256: + return True + if is_fa_version_supported(4): + return head_size <= 512 + return False @classmethod def supports_kv_cache_dtype(cls, kv_cache_dtype: CacheDType | None) -> bool: @@ -619,6 +626,14 @@ def __init__( requires_alibi=alibi_slopes is not None, head_size=head_size, ) + # head_size > 256 requires FA4 on SM90+; force upgrade from FA3 + if ( + head_size > 256 + and self.vllm_flash_attn_version == 3 + and current_platform.is_cuda() + and current_platform.is_device_capability_family(90) + ): + self.vllm_flash_attn_version = 4 logger.info_once( "Using FlashAttention version %s", self.vllm_flash_attn_version, diff --git a/vllm/v1/attention/backends/gdn_attn.py b/vllm/v1/attention/backends/gdn_attn.py index 5ebf040be7ae..85715e91ab40 100644 --- a/vllm/v1/attention/backends/gdn_attn.py +++ b/vllm/v1/attention/backends/gdn_attn.py @@ -253,7 +253,7 @@ def build( # type: ignore[override] ) # Filter by spec_sequence_masks to exclude padded sequences spec_state_indices_tensor = block_table_tensor[ - spec_sequence_masks, : self.num_spec + 1 + spec_sequence_masks_cpu, : self.num_spec + 1 ] non_spec_state_indices_tensor = None # Padded sequences are always at the back, so the first @@ -264,7 +264,9 @@ def build( # type: ignore[override] non_spec_query_start_loc_cpu = None else: spec_token_masks = torch.repeat_interleave( - spec_sequence_masks, query_lens + spec_sequence_masks, + query_lens, + output_size=query_start_loc_cpu[-1].item(), ) index = torch.argsort(spec_token_masks, stable=True) 
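# Editorial note on the output_size= argument added above (example values
# are illustrative): with a tensor `repeats`, torch.repeat_interleave
# otherwise needs a device sync to learn its output length; passing the
# already-known token total (query_start_loc_cpu[-1]) avoids that in the
# builder. Minimal sketch:
#
#     masks = torch.tensor([True, False, True])
#     lens = torch.tensor([2, 1, 3])
#     torch.repeat_interleave(masks, lens, output_size=int(lens.sum()))
#     # -> tensor([ True,  True, False,  True,  True,  True])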
num_non_spec_tokens = num_prefill_tokens + num_decode_tokens @@ -272,10 +274,10 @@ def build( # type: ignore[override] spec_token_indx = index[num_non_spec_tokens:] spec_state_indices_tensor = block_table_tensor[ - spec_sequence_masks, : self.num_spec + 1 + spec_sequence_masks_cpu, : self.num_spec + 1 ] non_spec_state_indices_tensor = block_table_tensor[ - ~spec_sequence_masks, 0 + ~spec_sequence_masks_cpu, 0 ] spec_query_start_loc = torch.zeros( @@ -284,7 +286,9 @@ def build( # type: ignore[override] device=query_start_loc.device, ) torch.cumsum( - query_lens[spec_sequence_masks], dim=0, out=spec_query_start_loc[1:] + query_lens[spec_sequence_masks_cpu], + dim=0, + out=spec_query_start_loc[1:], ) non_spec_query_start_loc = torch.zeros( query_lens.size(0) - num_spec_decodes + 1, @@ -292,7 +296,7 @@ def build( # type: ignore[override] device=query_start_loc.device, ) torch.cumsum( - query_lens[~spec_sequence_masks], + query_lens[~spec_sequence_masks_cpu], dim=0, out=non_spec_query_start_loc[1:], ) @@ -307,7 +311,7 @@ def build( # type: ignore[override] ) assert num_accepted_tokens is not None - num_accepted_tokens = num_accepted_tokens[spec_sequence_masks] + num_accepted_tokens = num_accepted_tokens[spec_sequence_masks_cpu] chunk_indices: torch.Tensor | None = None chunk_offsets: torch.Tensor | None = None @@ -331,8 +335,8 @@ def build( # type: ignore[override] if num_prefills > 0: has_initial_state = context_lens_tensor > 0 - if spec_sequence_masks is not None: - has_initial_state = has_initial_state[~spec_sequence_masks] + if spec_sequence_masks_cpu is not None: + has_initial_state = has_initial_state[~spec_sequence_masks_cpu] assert non_spec_query_start_loc_cpu is not None nums_dict, batch_ptr, token_chunk_offset_ptr = ( compute_causal_conv1d_metadata( diff --git a/vllm/vllm_flash_attn/flash_attn_interface.py b/vllm/vllm_flash_attn/flash_attn_interface.py index 9d9a9be2f316..eb0dbd42383f 100644 --- a/vllm/vllm_flash_attn/flash_attn_interface.py +++ b/vllm/vllm_flash_attn/flash_attn_interface.py @@ -366,14 +366,7 @@ def flash_attn_varlen_func( ) elif fa_version == 4: assert alibi_slopes is None, "Alibi is not supported in FA4" - # FA4 on SM90 doesn't support paged KV; SM100+ does - from vllm.platforms import current_platform - if block_table is not None and current_platform.is_device_capability_family(90): - raise NotImplementedError( - "FA4 with paged KV is not supported on SM90 (Hopper). " - "Use FA3 or upgrade to Blackwell (SM100+)." - ) from vllm.vllm_flash_attn.cute.interface import _flash_attn_fwd out, softmax_lse = _flash_attn_fwd(