diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/ANALYSIS.md b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/ANALYSIS.md
new file mode 100644
index 000000000000..1084aab57101
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/ANALYSIS.md
@@ -0,0 +1,120 @@
+# SSM zero-on-realloc ablation: 4-arm comparison
+
+- OUT_DIR: `/tmp/ssm_ablation_suite_v2`
+- git_sha: `670724746c596f6c095970c4d50b82e6328423db`
+- image: `nvllm:gb10-d2_7`
+- N runs per arm: 5
+- gsm8k_floor: 45
+
+## Verdict table (run x correct/errors)
+
+| Arm | SSM | KV | Run 1 | Run 2 | Run 3 | Run 4 | Run 5 | Gate |
+|-----|-----|----|-------|-------|-------|-------|-------|------|
+| both | 1 | 1 | 48/0err | 48/0err | 48/0err | 48/0err | 48/0err | True |
+| neither | 0 | 0 | 48/0err | 48/0err | 48/0err | 48/0err | 48/0err | True |
+| ssm_only | 1 | 0 | 48/0err | 48/0err | 48/0err | 48/0err | 48/0err | True |
+| kv_only | 0 | 1 | 47/0err | 47/0err | 47/0err | 47/0err | 47/0err | True |
+
+## Per-question table - Run 4 (collapse window)
+
+Columns per arm: lat (wall_time_s), ct (completion_tokens), dtok/s (decode_tok_s), fr (finish_reason), ok (correct).
+
+| Q | both:lat | both:ct | both:dtok/s | both:fr | both:ok | neither:lat | neither:ct | neither:dtok/s | neither:fr | neither:ok | ssm_only:lat | ssm_only:ct | ssm_only:dtok/s | ssm_only:fr | ssm_only:ok | kv_only:lat | kv_only:ct | kv_only:dtok/s | kv_only:fr | kv_only:ok |
+|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
+| 1 | 16.47 | 153 | 9.29 | stop | N | 16.43 | 153 | 9.31 | stop | N | 16.43 | 153 | 9.31 | stop | N | 16.68 | 155 | 9.29 | stop | N |
+| 2 | 23.52 | 217 | 9.23 | stop | Y | 23.53 | 217 | 9.22 | stop | Y | 23.54 | 217 | 9.22 | stop | Y | 12.73 | 116 | 9.11 | stop | Y |
+| 3 | 11.60 | 106 | 9.14 | stop | Y | 11.60 | 106 | 9.13 | stop | Y | 11.61 | 106 | 9.13 | stop | Y | 13.88 | 127 | 9.15 | stop | Y |
+| 4 | 19.42 | 179 | 9.21 | stop | Y | 19.43 | 179 | 9.21 | stop | Y | 19.44 | 179 | 9.21 | stop | Y | 20.66 | 190 | 9.20 | stop | Y |
+| 5 | 20.82 | 192 | 9.22 | stop | Y | 20.81 | 192 | 9.22 | stop | Y | 20.83 | 192 | 9.22 | stop | Y | 16.46 | 151 | 9.18 | stop | Y |
+| 6 | 13.74 | 126 | 9.17 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.76 | 126 | 9.16 | stop | Y | 14.43 | 132 | 9.15 | stop | Y |
+| 7 | 13.63 | 125 | 9.17 | stop | Y | 13.64 | 125 | 9.17 | stop | Y | 13.65 | 125 | 9.16 | stop | Y | 11.20 | 102 | 9.11 | stop | Y |
+| 8 | 19.64 | 181 | 9.22 | stop | Y | 19.65 | 181 | 9.21 | stop | Y | 19.63 | 181 | 9.22 | stop | Y | 15.28 | 140 | 9.16 | stop | N |
+| 9 | 10.10 | 92 | 9.11 | stop | Y | 10.10 | 92 | 9.11 | stop | Y | 10.10 | 92 | 9.11 | stop | Y | 10.12 | 92 | 9.09 | stop | Y |
+| 10 | 13.21 | 121 | 9.16 | stop | Y | 13.21 | 121 | 9.16 | stop | Y | 13.20 | 121 | 9.16 | stop | Y | 10.88 | 99 | 9.10 | stop | Y |
+| 11 | 19.55 | 180 | 9.21 | stop | Y | 19.56 | 180 | 9.21 | stop | Y | 19.55 | 180 | 9.21 | stop | Y | 45.53 | 421 | 9.25 | stop | Y |
+| 12 | 21.24 | 196 | 9.23 | stop | Y | 21.25 | 196 | 9.22 | stop | Y | 21.24 | 196 | 9.23 | stop | Y | 22.49 | 207 | 9.21 | stop | Y |
+| 13 | 16.96 | 156 | 9.20 | stop | Y | 16.97 | 156 | 9.19 | stop | Y | 16.96 | 156 | 9.20 | stop | Y | 18.29 | 168 | 9.18 | stop | Y |
+| 14 | 15.02 | 138 | 9.19 | stop | Y | 15.03 | 138 | 9.18 | stop | Y | 15.02 | 138 | 9.19 | stop | Y | 14.74 | 135 | 9.16 | stop | Y |
+| 15 | 14.49 | 133 | 9.18 | stop | Y | 14.50 | 133 | 9.17 | stop | Y | 14.48 | 133 | 9.18 | stop | Y | 16.16 | 148 | 9.16 | stop | Y |
+| 16 | 11.28 | 103 | 9.13 | stop | Y | 11.29 | 103 | 9.13 | stop | Y | 11.27 | 103 | 9.14 | stop | Y | 10.98 | 100 | 9.10 | stop | Y |
+| 17 | 11.38 | 104 | 9.14 | stop | Y | 11.39 | 104 | 9.13 | stop | Y | 11.38 | 104 | 9.14 | stop | Y | 14.21 | 130 | 9.15 | stop | Y |
+| 18 | 13.75 | 126 | 9.17 | stop | Y | 13.75 | 126 | 9.17 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.03 | 119 | 9.13 | stop | Y |
+| 19 | 11.17 | 102 | 9.13 | stop | Y | 11.17 | 102 | 9.13 | stop | Y | 11.17 | 102 | 9.13 | stop | Y | 10.65 | 97 | 9.10 | stop | Y |
+| 20 | 9.98 | 91 | 9.11 | stop | Y | 9.99 | 91 | 9.11 | stop | Y | 10.00 | 91 | 9.10 | stop | Y | 9.38 | 85 | 9.07 | stop | Y |
+| 21 | 13.41 | 123 | 9.17 | stop | Y | 13.42 | 123 | 9.17 | stop | Y | 13.41 | 123 | 9.17 | stop | Y | 13.34 | 122 | 9.14 | stop | Y |
+| 22 | 26.52 | 245 | 9.24 | stop | Y | 26.52 | 245 | 9.24 | stop | Y | 26.51 | 245 | 9.24 | stop | Y | 26.26 | 242 | 9.22 | stop | Y |
+| 23 | 9.67 | 88 | 9.10 | stop | Y | 9.67 | 88 | 9.10 | stop | Y | 9.67 | 88 | 9.10 | stop | Y | 13.24 | 121 | 9.14 | stop | Y |
+| 24 | 15.45 | 142 | 9.19 | stop | Y | 15.46 | 142 | 9.19 | stop | Y | 15.44 | 142 | 9.19 | stop | Y | 15.49 | 142 | 9.17 | stop | Y |
+| 25 | 35.08 | 325 | 9.26 | stop | Y | 35.09 | 325 | 9.26 | stop | Y | 35.08 | 325 | 9.27 | stop | Y | 25.08 | 231 | 9.21 | stop | Y |
+| 26 | 16.85 | 155 | 9.20 | stop | Y | 16.86 | 155 | 9.19 | stop | Y | 16.85 | 155 | 9.20 | stop | Y | 20.34 | 187 | 9.19 | stop | Y |
+| 27 | 14.28 | 131 | 9.17 | stop | Y | 14.27 | 131 | 9.18 | stop | Y | 14.28 | 131 | 9.18 | stop | Y | 9.59 | 87 | 9.07 | stop | Y |
+| 28 | 24.69 | 228 | 9.24 | stop | Y | 24.69 | 228 | 9.23 | stop | Y | 24.68 | 228 | 9.24 | stop | Y | 27.98 | 258 | 9.22 | stop | Y |
+| 29 | 16.75 | 154 | 9.20 | stop | Y | 16.76 | 154 | 9.19 | stop | Y | 16.75 | 154 | 9.19 | stop | Y | 17.01 | 156 | 9.17 | stop | Y |
+| 30 | 11.38 | 104 | 9.14 | stop | Y | 11.37 | 104 | 9.15 | stop | Y | 11.37 | 104 | 9.14 | stop | Y | 10.33 | 94 | 9.10 | stop | Y |
+| 31 | 19.22 | 177 | 9.21 | stop | Y | 19.22 | 177 | 9.21 | stop | Y | 19.22 | 177 | 9.21 | stop | Y | 20.03 | 184 | 9.19 | stop | Y |
+| 32 | 17.93 | 165 | 9.20 | stop | Y | 17.93 | 165 | 9.20 | stop | Y | 17.93 | 165 | 9.20 | stop | Y | 18.95 | 174 | 9.18 | stop | Y |
+| 33 | 14.49 | 133 | 9.18 | stop | Y | 14.50 | 133 | 9.17 | stop | Y | 14.49 | 133 | 9.18 | stop | Y | 18.19 | 167 | 9.18 | stop | Y |
+| 34 | 25.86 | 239 | 9.24 | stop | Y | 25.87 | 239 | 9.24 | stop | Y | 25.85 | 239 | 9.25 | stop | Y | 24.64 | 227 | 9.21 | stop | Y |
+| 35 | 34.01 | 315 | 9.26 | stop | Y | 34.03 | 315 | 9.26 | stop | Y | 34.02 | 315 | 9.26 | stop | Y | 20.66 | 190 | 9.19 | stop | Y |
+| 36 | 14.74 | 135 | 9.16 | stop | Y | 14.72 | 135 | 9.17 | stop | Y | 14.72 | 135 | 9.17 | stop | Y | 16.15 | 148 | 9.16 | stop | Y |
+| 37 | 21.80 | 201 | 9.22 | stop | Y | 21.79 | 201 | 9.22 | stop | Y | 21.79 | 201 | 9.22 | stop | Y | 21.95 | 202 | 9.20 | stop | Y |
+| 38 | 21.02 | 194 | 9.23 | stop | Y | 21.03 | 194 | 9.22 | stop | Y | 21.03 | 194 | 9.22 | stop | Y | 21.30 | 196 | 9.20 | stop | Y |
+| 39 | 16.74 | 154 | 9.20 | stop | Y | 16.75 | 154 | 9.20 | stop | Y | 16.74 | 154 | 9.20 | stop | Y | 17.21 | 158 | 9.18 | stop | Y |
+| 40 | 20.17 | 186 | 9.22 | stop | Y | 20.18 | 186 | 9.21 | stop | Y | 20.18 | 186 | 9.22 | stop | Y | 12.38 | 113 | 9.13 | stop | Y |
+| 41 | 25.83 | 239 | 9.25 | stop | Y | 25.86 | 239 | 9.24 | stop | Y | 25.88 | 239 | 9.23 | stop | Y | 30.79 | 284 | 9.22 | stop | Y |
+| 42 | 17.81 | 164 | 9.21 | stop | Y | 17.82 | 164 | 9.20 | stop | Y | 17.83 | 164 | 9.20 | stop | Y | 17.00 | 156 | 9.17 | stop | Y |
+| 43 | 11.60 | 106 | 9.14 | stop | Y | 11.61 | 106 | 9.13 | stop | Y | 11.60 | 106 | 9.13 | stop | Y | 16.25 | 149 | 9.17 | stop | Y |
+| 44 | 9.03 | 82 | 9.09 | stop | Y | 9.03 | 82 | 9.08 | stop | Y | 9.02 | 82 | 9.09 | stop | Y | 12.81 | 117 | 9.13 | stop | Y |
+| 45 | 55.13 | 512 | 9.29 | length | N | 55.18 | 512 | 9.28 | length | N | 55.15 | 512 | 9.28 | length | N | 55.28 | 512 | 9.26 | length | N |
+| 46 | 21.28 | 198 | 9.30 | stop | Y | 21.26 | 198 | 9.31 | stop | Y | 21.26 | 198 | 9.31 | stop | Y | 25.19 | 234 | 9.29 | stop | Y |
+| 47 | 20.08 | 185 | 9.21 | stop | Y | 20.08 | 185 | 9.21 | stop | Y | 20.07 | 185 | 9.22 | stop | Y | 20.12 | 185 | 9.19 | stop | Y |
+| 48 | 26.50 | 245 | 9.24 | stop | Y | 26.52 | 245 | 9.24 | stop | Y | 26.50 | 245 | 9.24 | stop | Y | 29.47 | 272 | 9.23 | stop | Y |
+| 49 | 13.73 | 126 | 9.18 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.13 | 120 | 9.14 | stop | Y |
+| 50 | 22.21 | 205 | 9.23 | stop | Y | 22.22 | 205 | 9.22 | stop | Y | 22.22 | 205 | 9.22 | stop | Y | 14.75 | 135 | 9.15 | stop | Y |
+
+## Aggregate per-arm steady-state stats (concat across runs)
+
+| Arm | N | median dtok/s | p95 wall_s | mean completion_tokens | finish_reason hist |
+|-----|---|---------------|------------|------------------------|--------------------|
+| both | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 |
+| neither | 250 | 9.19 | 34.03 | 169.5 | length=5, stop=245 |
+| ssm_only | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 |
+| kv_only | 250 | 9.17 | 30.74 | 169.5 | length=5, stop=245 |
+
+## Friend's interpretation thresholds applied
+
+- 'real pipeline win' iff median decode_tok_s >= 1.30x baseline ('neither') AND mean completion_tokens >= 0.85x baseline
+- 'shortened generations' iff decode rate up but completion_tokens < 0.85x baseline
+
+- **both**: no decode win vs baseline (decode 1.00x, compt 1.00x)
+- **neither**: baseline
+- **ssm_only**: no decode win vs baseline (decode 1.00x, compt 1.00x)
+- **kv_only**: no decode win vs baseline (decode 1.00x, compt 1.00x)
+
+## Drained KV invariant (per-run pre vs post)
+
+Tolerance: |delta| <= 0.05 (5 pp) counts as drained.
+
+| Arm | Run | KV pre | KV post | delta | drained |
+|-----|-----|--------|---------|-------|---------|
+| both | 1 | 0.0000 | 0.0027 | 0.0027 | Y |
+| both | 2 | 0.0000 | 0.0027 | 0.0027 | Y |
+| both | 3 | 0.0000 | 0.0027 | 0.0027 | Y |
+| both | 4 | 0.0000 | 0.0027 | 0.0027 | Y |
+| both | 5 | 0.0000 | 0.0027 | 0.0027 | Y |
+| neither | 1 | 0.0000 | 0.0027 | 0.0027 | Y |
+| neither | 2 | 0.0000 | 0.0027 | 0.0027 | Y |
+| neither | 3 | 0.0000 | 0.0027 | 0.0027 | Y |
+| neither | 4 | 0.0000 | 0.0027 | 0.0027 | Y |
+| neither | 5 | 0.0000 | 0.0027 | 0.0027 | Y |
+| ssm_only | 1 | 0.0000 | 0.0027 | 0.0027 | Y |
+| ssm_only | 2 | 0.0000 | 0.0027 | 0.0027 | Y |
+| ssm_only | 3 | 0.0000 | 0.0027 | 0.0027 | Y |
+| ssm_only | 4 | 0.0000 | 0.0027 | 0.0027 | Y |
+| ssm_only | 5 | 0.0000 | 0.0027 | 0.0027 | Y |
+| kv_only | 1 | 0.0000 | 0.0027 | 0.0027 | Y |
+| kv_only | 2 | 0.0000 | 0.0027 | 0.0027 | Y |
+| kv_only | 3 | 0.0000 | 0.0027 | 0.0027 | Y |
+| kv_only | 4 | 0.0000 | 0.0027 | 0.0027 | Y |
+| kv_only | 5 | 0.0000 | 0.0027 | 0.0027 | Y |
+
diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/comparison.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/comparison.json
new file mode 100644
index 000000000000..569fc0b9fcd7
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/comparison.json
@@ -0,0 +1,15 @@
+{
+  "out_dir": "/tmp/ssm_ablation_suite_v2",
+  "git_sha": "670724746c596f6c095970c4d50b82e6328423db",
+  "image": "nvllm:gb10-d2_7",
+  "n_runs": 5,
+  "gsm8k_floor": 45,
+  "patched_repo": "/tmp/nvllm-ssm-sentinel-patched",
+  "sentinels_root": "/tmp/nvllm-ablation-sentinels-v2",
+  "arms": [
+    {"arm": "both", "ssm_sentinel": 1, "kv_sentinel": 1, "gate_pass": "true", "correct_per_run": "48,48,48,48,48", "verdict": "/tmp/ssm_ablation_suite_v2/both/verdict.json"},
+    {"arm": "neither", "ssm_sentinel": 0, "kv_sentinel": 0, "gate_pass": "true", "correct_per_run": "48,48,48,48,48", "verdict": "/tmp/ssm_ablation_suite_v2/neither/verdict.json"},
+    {"arm": "ssm_only", "ssm_sentinel": 1, "kv_sentinel": 0, "gate_pass": "true", "correct_per_run": "48,48,48,48,48", "verdict": "/tmp/ssm_ablation_suite_v2/ssm_only/verdict.json"},
+    {"arm": "kv_only", "ssm_sentinel": 0, "kv_sentinel": 1, "gate_pass": "true", "correct_per_run": "47,47,47,47,47", "verdict": "/tmp/ssm_ablation_suite_v2/kv_only/verdict.json"}
+  ]
+}
diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/runner_manifest.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/runner_manifest.json
new file mode 100644
index 000000000000..29ed9dc97670
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/runner_manifest.json
@@ -0,0 +1,20 @@
+{
+  "runner": "/tmp/run_ablation_suite_v2.sh",
+  "started_utc": "2026-05-15T18:17:46Z",
+  "git_sha": "670724746c596f6c095970c4d50b82e6328423db",
+  "image": "nvllm:gb10-d2_7",
+  "image_id": "nvllm:gb10-d2_7@4df53234ad5c",
+  "image_digest": "no-digest",
+  "patched_repo": "/tmp/nvllm-ssm-sentinel-patched",
+  "sentinels_root": "/tmp/nvllm-ablation-sentinels-v2",
+  "host_name": "navi-ai",
+  "host_driver": "590.48.01",
+  "host_kernel": "6.17.0-1014-nvidia",
+  "gsm8k_n": 50,
+  "gsm8k_seed": 42,
+  "gsm8k_max_tokens": 512,
+  "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf",
+  "hf_model": "ig1/Qwen3.5-27B-NVFP4",
+  "n_runs": 5,
+  "arms": ["both", "neither", "ssm_only", "kv_only"]
+}
diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/summary.md b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/summary.md
new file mode 100644
index 000000000000..6df72ccc678f
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/summary.md
@@ -0,0 +1,137 @@
+# SSM zero-on-realloc — 4-arm sentinel ablation (2026-05-15)
+
+## Status
+
+**Harness validation only. No perf claim. No nsys trace.**
+
+The β-coop sustained-load collapse this work was designed to discriminate
+did not reproduce on the host on 2026-05-15. Per friend's reframing:
+> "If all arms pass and counters prove toggles worked: result is 'patch
+> not harmful under non-repro conditions; collapse not available for
+> discrimination.'"
+
+That is the result here. The harness is committed for the next collapse
+window.
+
+## What ran
+
+- **Suite:** 4 arms (both / neither / ssm_only / kv_only), 5×GSM8K-50
+  runs per arm, fresh container per arm, sentinel files at
+  `/run/nvllm/*.enabled` bind-mounted `:ro` per arm.
+- **Total:** 20 runs (1000 generations), ~5h 25min wall (14:17→19:41
+  EDT). 0 errors, 0 OOMs, 0 container restarts.
+- **Suite code:** `scripts/ablation/run_ssm_ablation_suite.sh`,
+  `scripts/ablation/ssm_ablation_compare.py`, overlay applied via
+  `scripts/ablation/prepare_sentinel_overlay.sh`.
+
+## Host / image manifest
+
+| Field | Value |
+|---|---|
+| started_utc | 2026-05-15T18:17:46Z |
+| git_sha | `670724746c596f6c095970c4d50b82e6328423db` (`plan/beta-coop-layer-sweep-wo8` head at suite time) |
+| image | `nvllm:gb10-d2_7` |
+| image_id | `nvllm:gb10-d2_7@4df53234ad5c` |
+| image_digest | `no-digest` (local-built image, never pushed) |
+| host_driver | `590.48.01` |
+| host_kernel | `6.17.0-1014-nvidia` |
+| hardware | NVIDIA DGX Spark (GB10, SM120, 128 GB unified) |
+| hf_model | `ig1/Qwen3.5-27B-NVFP4` |
+| served_name | `default` |
+| gsm8k_n | 50 |
+| gsm8k_seed | 42 |
+| gsm8k_max_tokens | 512 |
+| prompt_set_hash | `f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf` (sha256 of `n|seed|model|served-name`) |
+| phase_e_layers | `3,7` |
+| wo_split | 8 |
+
+## Per-arm verdict
+
+| Arm | SSM sentinel | KV sentinel | runs (correct/50) | first_fire (ssm,kv) | gate_pass | harness_pass |
+|---|---|---|---|---|---|---|
+| `both` | 1 | 1 | 48,48,48,48,48 | (1, 1) | true | true |
+| `neither` | 0 | 0 | 48,48,48,48,48 | (0, 0) | true | true |
+| `ssm_only` | 1 | 0 | 48,48,48,48,48 | (1, 0) | true | true |
+| `kv_only` | 0 | 1 | 47,47,47,47,47 | (0, 1) | true | true |
+
+`harness_pass=true` for all four arms means: when SSM_sentinel=1 the SSM
+gate fired (and not when SSM_sentinel=0); same for KV. The sentinel
+machinery is proven to discriminate. The env-strip confound from a prior
+env-gated attempt is eliminated.
+
+## What this shows and does not show
+
+**Shows:**
+- Sentinel-file gating works through vLLM EngineCore (env-stripped) where
+  env-var gating did not.
+- Under non-collapsing host state, the SSM zero-on-realloc patch is
+  correctness-neutral (`both` and `ssm_only` both 48/50, identical to
+  `neither` baseline) and perf-neutral (median decode within 0.03 tok/s
+  across all arms).
+- The KV `new_block_ids` channel relax (kv_only arm) is **NOT
+  correctness-neutral**: a deterministic -1 question across all 5 runs.
+  That is the basis for shipping the SSM patch alone in the production
+  commit and keeping the KV relax in the harness overlay only.
+
+**Does not show:**
+- Whether the SSM patch fixes the β-coop sustained-load collapse: the
+  collapse did not reproduce under today's host state.
+- Any performance win: median decode is flat across arms; no nsys trace
+  was captured.
+
+## Per-arm steady-state stats
+
+(See `ANALYSIS.md` Section "Aggregate per-arm steady-state stats".)
+
+| Arm | N | median dtok/s | p95 wall_s | mean completion_tokens | finish_reason |
+|---|---|---|---|---|---|
+| both | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 |
+| neither | 250 | 9.19 | 34.03 | 169.5 | length=5, stop=245 |
+| ssm_only | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 |
+| kv_only | 250 | 9.17 | 30.74 | 169.5 | length=5, stop=245 |
+
+Note: per-arm 50-question completion-token sums all land at 8477 tokens
+(mean 169.54). Per-Q values do differ (e.g. `kv_only` Q2 = 116 tokens vs
+217 for the other three arms — verified distinct via output sha256 and
+output_len), but per-arm sums coincide. This is a chance numerical
+balance, not a stat-collection bug.
+
+## Drained KV invariant
+
+All 20 runs drained KV cleanly: `vllm:kv_cache_usage_perc` returned to
+≤0.3pp of baseline at the post-run snapshot, well inside the 5pp
+tolerance. (See `ANALYSIS.md` for the full per-run table.)
+
+## How to reproduce
+
+```bash
+# 1. Build a sentinel-overlaid scratch checkout (~5 sec).
+scripts/ablation/prepare_sentinel_overlay.sh /tmp/nvllm-ssm-sentinel-patched
+
+# 2. Run the 4-arm sweep (~5 h with default 5 runs x 4 arms x ~15 min/run).
+scripts/ablation/run_ssm_ablation_suite.sh
+
+# 3. Produce ANALYSIS.md from the per-arm verdicts.
+.venv/bin/python scripts/ablation/ssm_ablation_compare.py /tmp/ssm_ablation_suite
+```
+
+Env overrides for the runner are documented in the script header (see
+`scripts/ablation/run_ssm_ablation_suite.sh`).
+
+## What is NOT committed
+
+- Per-arm `docker.log` (×4, ~50 MB each)
+- Per-arm `serve.log` (×4)
+- Per-run per-Q `perq.jsonl` (×20 = 1000 records, ~600 KB)
+- Per-run `metrics_*.json` (×120 snapshots)
+- The full mamba slot trace (~750 events × 4 arms)
+
+These artifacts live in the suite OUT_DIR
+(`/tmp/ssm_ablation_suite_v2/` at run time) and can be regenerated by
+re-running the harness against the committed scripts.
+
+## Related
+
+- `docs/research/2026-05-15-ssm-zero-on-realloc/README.md` — design + harness usage
+- Production patch: commit `feat(worker): add MambaBlockZeroer sister zeroer for SSM zero-on-realloc`
+- Harness commit: `test(ablation): sentinel-gated SSM zero-on-realloc ablation harness`
diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-both.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-both.json
new file mode 100644
index 000000000000..3bf8164430c1
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-both.json
@@ -0,0 +1,36 @@
+{
+  "arm": "both",
+  "ssm_sentinel": 1,
+  "kv_sentinel": 1,
+  "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated",
+  "patched_repo": "/tmp/nvllm-ssm-sentinel-patched",
+  "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/both",
+  "sentinel_files_inside": "kv_zero_for_mamba_ids.enabled,zero_ssm_on_realloc.enabled",
+  "container_id": "f3febab6e399ffc388d45b35e2d0cf12b525d9af2b9034df67969c80b8e8953e",
+  "host_driver": "590.48.01",
+  "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf",
+  "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 1, "kv_first_fire": 1},
+  "git_sha": "670724746c596f6c095970c4d50b82e6328423db",
+  "image": "nvllm:gb10-d2_7",
+  "image_id": "nvllm:gb10-d2_7@4df53234ad5c",
+  "phase_e_layers": "3,7",
+  "phase_e_fusion": 1,
+  "phase_e_path": "auto",
+  "wo_split": 8,
+  "n_runs": 5,
+  "gsm8k_floor": 45,
+  "mamba_slot_trace_lines": 753,
+  "ablation_events": {"sentinel_check": 2, "first_fire": 2, "fire_count": 9},
+  "token_summary": {"n_questions": 250, "sum_completion_tokens": 42385, "sum_prompt_tokens": 18405, "median_wall_time_s": 16.80155, "median_decode_tok_s": 9.198},
+  "runs": [
+    {"run": 1, "correct": 48, "errors": 0, "pass": true},
+    {"run": 2, "correct": 48, "errors": 0, "pass": true},
+    {"run": 3, "correct": 48, "errors": 0, "pass": true},
+    {"run": 4, "correct": 48, "errors": 0, "pass": true},
+    {"run": 5, "correct": 48, "errors": 0, "pass": true}
+  ],
+  "container_alive_at_end": true,
+  "docker_log_corruption_hits": 0,
+  "gate_pass": true,
+  "harness_pass": true
+}
diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-kv_only.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-kv_only.json
new file mode 100644
index 000000000000..962b11c78550
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-kv_only.json
@@ -0,0 +1,36 @@
+{
+  "arm": "kv_only",
+  "ssm_sentinel": 0,
+  "kv_sentinel": 1,
+  "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated",
+  "patched_repo": "/tmp/nvllm-ssm-sentinel-patched",
+  "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/kv_only",
+  "sentinel_files_inside": "kv_zero_for_mamba_ids.enabled",
+  "container_id": "20d76a07f0aff2648ef278a290a3555915ad4914929f017ef98461476a7d0e6d",
+  "host_driver": "590.48.01",
+  "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf",
+  "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 0, "kv_first_fire": 1},
+  "git_sha": "670724746c596f6c095970c4d50b82e6328423db",
+  "image": "nvllm:gb10-d2_7",
+  "image_id": "nvllm:gb10-d2_7@4df53234ad5c",
+  "phase_e_layers": "3,7",
+  "phase_e_fusion": 1,
+  "phase_e_path": "auto",
+  "wo_split": 8,
+  "n_runs": 5,
+  "gsm8k_floor": 45,
+  "mamba_slot_trace_lines": 753,
+  "ablation_events": {"sentinel_check": 2, "first_fire": 1, "fire_count": 7},
+  "token_summary": {"n_questions": 250, "sum_completion_tokens": 42385, "sum_prompt_tokens": 18405, "median_wall_time_s": 16.3489, "median_decode_tok_s": 9.1695},
+  "runs": [
+    {"run": 1, "correct": 47, "errors": 0, "pass": true},
+    {"run": 2, "correct": 47, "errors": 0, "pass": true},
+    {"run": 3, "correct": 47, "errors": 0, "pass": true},
+    {"run": 4, "correct": 47, "errors": 0, "pass": true},
+    {"run": 5, "correct": 47, "errors": 0, "pass": true}
+  ],
+  "container_alive_at_end": true,
+  "docker_log_corruption_hits": 0,
+  "gate_pass": true,
+  "harness_pass": true
+}
diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-neither.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-neither.json
new file mode 100644
index 000000000000..a190c2389e17
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-neither.json
@@ -0,0 +1,36 @@
+{
+  "arm": "neither",
+  "ssm_sentinel": 0,
+  "kv_sentinel": 0,
+  "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated",
+  "patched_repo": "/tmp/nvllm-ssm-sentinel-patched",
+  "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/neither",
+  "sentinel_files_inside": "",
+  "container_id": "f966c41b02ef6a455464d93361104fc49ab4d5420b21ec159d5ca37018c03559",
+  "host_driver": "590.48.01",
+  "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf",
+  "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 0, "kv_first_fire": 0},
+  "git_sha": "670724746c596f6c095970c4d50b82e6328423db",
+  "image": "nvllm:gb10-d2_7",
+  "image_id": "nvllm:gb10-d2_7@4df53234ad5c",
+  "phase_e_layers": "3,7",
+  "phase_e_fusion": 1,
+  "phase_e_path": "auto",
+  "wo_split": 8,
+  "n_runs": 5,
+  "gsm8k_floor": 45,
+  "mamba_slot_trace_lines": 753,
+  "ablation_events": {"sentinel_check": 2, "first_fire": 0, "fire_count": 0},
+  "token_summary": {"n_questions": 250, "sum_completion_tokens": 42385, "sum_prompt_tokens": 18405, "median_wall_time_s": 16.81105, "median_decode_tok_s": 9.193},
+  "runs": [
+    {"run": 1, "correct": 48, "errors": 0, "pass": true},
+    {"run": 2, "correct": 48, "errors": 0, "pass": true},
+    {"run": 3, "correct": 48, "errors": 0, "pass": true},
+    {"run": 4, "correct": 48, "errors": 0, "pass": true},
+    {"run": 5, "correct": 48, "errors": 0, "pass": true}
+  ],
+  "container_alive_at_end": true,
+  "docker_log_corruption_hits": 0,
+  "gate_pass": true,
+  "harness_pass": true
+}
diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-ssm_only.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-ssm_only.json
new file mode 100644
index 000000000000..418754cb3888
--- /dev/null
+++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-ssm_only.json
@@ -0,0 +1,36 @@
+{
+  "arm": "ssm_only",
+  "ssm_sentinel": 1,
+  "kv_sentinel": 0,
+  "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated",
+  "patched_repo": "/tmp/nvllm-ssm-sentinel-patched",
+  "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/ssm_only",
+  "sentinel_files_inside": "zero_ssm_on_realloc.enabled",
+  "container_id": "ee4e6c9016a4e817239b9f75ce2022a6ce7af6b3713474ffcb1d7f5b1544411f",
+  "host_driver": "590.48.01",
+  "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf",
+  "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 1, "kv_first_fire": 0},
+  "git_sha": "670724746c596f6c095970c4d50b82e6328423db",
+  "image": "nvllm:gb10-d2_7",
+  "image_id": "nvllm:gb10-d2_7@4df53234ad5c",
+  "phase_e_layers": "3,7",
+  "phase_e_fusion": 1,
+  "phase_e_path": "auto",
+  "wo_split": 8,
+  "n_runs": 5,
+  "gsm8k_floor": 45,
+  "mamba_slot_trace_lines": 753,
+  "ablation_events": {"sentinel_check": 2, "first_fire": 1, "fire_count": 2},
+  "token_summary": {"n_questions": 250, "sum_completion_tokens": 42385, "sum_prompt_tokens": 18405, "median_wall_time_s": 16.8064, "median_decode_tok_s": 9.197},
+  "runs": [
+    {"run": 1, "correct": 48, "errors": 0, "pass": true},
+    {"run": 2, "correct": 48, "errors": 0, "pass": true},
+    {"run": 3, "correct": 48, "errors": 0, "pass": true},
+    {"run": 4, "correct": 48, "errors": 0, "pass": true},
+    {"run": 5, "correct": 48, "errors": 0, "pass": true}
+  ],
+  "container_alive_at_end": true,
+  "docker_log_corruption_hits": 0,
+  "gate_pass": true,
+  "harness_pass": true
+}
diff --git a/docs/research/2026-05-15-ssm-zero-on-realloc/README.md b/docs/research/2026-05-15-ssm-zero-on-realloc/README.md
new file mode 100644
index 000000000000..7980c5fb984d
--- /dev/null
+++ b/docs/research/2026-05-15-ssm-zero-on-realloc/README.md
@@ -0,0 +1,121 @@
+# SSM zero-on-realloc — design + sentinel ablation harness
+
+## What
+
+The production patch (commit `feat(worker): add MambaBlockZeroer sister
+zeroer for SSM zero-on-realloc`) adds an SSM zero-on-realloc guard alongside
+the existing full-attention KV zero-on-realloc path.
+
+`KVBlockZeroer.zero_block_ids` now also walks a sister `MambaBlockZeroer` on
+the same block-ID list, zeroing recycled `conv_state` / `ssm_state` rows via
+`torch.Tensor.index_fill_` before the next prefill writes into them.
+
+## Why
+
+The existing `KVBlockZeroer` (upstream PR #35219) clears full-attn KV blocks
+at request-free / block-realloc time but skips Mamba layers because the conv
+/ ssm page sizes differ from the full-attn page size and cannot share the
+Triton kernel's uniform `PAGE_SIZE_EL`. `MambaBlockZeroer` covers the
+remaining state.
+
+This addresses one half of the suspect set from the Mamba SSM cache
+lifecycle audit (memory:`project_mamba_ssm_lifecycle`):
+> "what's accumulating in-process between runs that isn't in any cherry-pick"
+
+Hybrid-attention models (Qwen3.5-27B and similar) hold per-block mamba
+state in tensors whose leading dim is `num_blocks`. When a block ID is
+recycled to a new request, the old request's mamba state in that slot
+would otherwise persist as initial state for the new prefill.
+
+## What this commit series does NOT claim
+
+- **No "fixes collapse" claim.** The β-coop sustained-load collapse was not
+  reproducing on the host at the time of this work (2026-05-15). The patch
+  is shipped because the lifecycle gap is real; the patch's effect under
+  the failing host state is unknown.
+- **No perf claim.** No nsys trace was captured. The 4-arm sentinel
+  ablation (below) shows median decode_tok_s within 0.03 tok/s across all
+  arms (perf-neutral under non-collapse load), but that is not a perf win.
+
+## The sentinel ablation harness
+
+The harness in `scripts/ablation/` lets a future operator A/B the patch
+under a future collapse window without having to rebuild the image.
+
+It applies a sentinel overlay (`scripts/ablation/ssm_sentinel_overlay.patch`)
+to a scratch checkout of the repo, replacing the production
+always-on firing path with a filesystem-sentinel gated version. Per-arm,
+the runner bind-mounts a per-arm sentinel directory at `/run/nvllm` :ro;
+the gate at module-import-time stats the sentinel file and caches the
+result.
+
+### Why sentinel files, not env vars
+
+vLLM EngineCore spawns the worker subprocess with most env vars stripped
+(memory:`feedback_vllm_enginecore_env_strip`); only `VLLM_TARGET_DEVICE`
+and `VLLM_WORKER_MULTIPROC_METHOD` survive. A previous env-gated
+ablation (`v1`) was a null A/B because the gate always read empty-string.
+Sentinel files survive subprocess spawn because the file system is the
+shared substrate.
+
+### Sentinel paths
+
+| Path | Effect when present |
+|---|---|
+| `/run/nvllm/zero_ssm_on_realloc.enabled` | SSM zero-on-realloc fires |
+| `/run/nvllm/kv_zero_for_mamba_ids.enabled` | KV `new_block_ids` channel relaxed for MambaSpec allocations |
+
+The KV channel relax is included in the overlay for completeness but is
+NOT shipped in the production patch — the 2026-05-15 4-arm sweep showed it
+introduces a deterministic -1 question on `kv_only` (47/50 × 5 vs 48/50 ×
+5 on `both`, `neither`, `ssm_only`).
+
+### Per-arm signature
+
+When the harness runs each arm, the docker log triad proves which gates
+fired:
+
+| Event | Meaning |
+|---|---|
+| `nvllm.ablation.sentinel_check name=<n> path=<p> exists=<b> enabled=<b>` | One per gate per worker process, at first call |
+| `nvllm.ablation.first_fire name=<n> n_block_ids=<N>` | One per gate the first time the patched branch fires |
+| `nvllm.ablation.fire_count name=<n> count=<N>` | Every 100th fire |
+
+`verdict.json` per arm includes:
+- `harness_validation.pass` — false if SSM_sentinel=1 but first_fire=0
+  (or vice versa), per gate
+- `harness_pass` — top-level boolean mirror
+
+### Reproducing the 2026-05-15 sweep
+
+```bash
+# Build a sentinel-overlaid scratch checkout (~5 sec).
+scripts/ablation/prepare_sentinel_overlay.sh /tmp/nvllm-ssm-sentinel-patched
+
+# Run the 4-arm sweep (~5 h with default 5 runs × 4 arms × ~15 min/run).
+scripts/ablation/run_ssm_ablation_suite.sh
+
+# Produce ANALYSIS.md from the per-arm verdicts.
+.venv/bin/python scripts/ablation/ssm_ablation_compare.py /tmp/ssm_ablation_suite
+```
+
+Env overrides for the runner are documented in the script header.
+
+## Evidence
+
+The 2026-05-15 evidence dir lives at:
+
+```
+benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/
+```
+
+See its `summary.md` for the per-arm verdict table, host/image manifest,
+and what the run did and did not prove.
+
+## Related memory
+
+- `project_beta_coop_sustained_collapse` — the closed bisection arc
+- `project_mamba_ssm_lifecycle` — the lifecycle audit that scoped this fix
+- `feedback_substrate_not_cherry_pick` — methodology lesson from D2.x
+- `feedback_vllm_enginecore_env_strip` — why env vars aren't reliable
+- `feedback_default_vs_base_path_coverage` — why we keep the harness
diff --git a/scripts/ablation/prepare_sentinel_overlay.sh b/scripts/ablation/prepare_sentinel_overlay.sh
new file mode 100755
index 000000000000..4e15b2fdc904
--- /dev/null
+++ b/scripts/ablation/prepare_sentinel_overlay.sh
@@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+# Build a sentinel-overlaid scratch checkout for run_ssm_ablation_suite.sh.
+#
+# Usage:
+#   scripts/ablation/prepare_sentinel_overlay.sh [SCRATCH_DIR]
+#
+# Defaults SCRATCH_DIR to /tmp/nvllm-ssm-sentinel-patched.
+# Clones the current repo HEAD into SCRATCH_DIR, applies the sentinel
+# overlay patch, and verifies the marker strings landed.
+#
+# The runner expects $PATCHED_REPO to point at SCRATCH_DIR.
+
+set -euo pipefail
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+REPO_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)"
+SCRATCH_DIR="${1:-/tmp/nvllm-ssm-sentinel-patched}"
+OVERLAY="$SCRIPT_DIR/ssm_sentinel_overlay.patch"
+
+if [ ! -f "$OVERLAY" ]; then
+  echo "ERROR: overlay patch missing: $OVERLAY" >&2
+  exit 1
+fi
+
+if [ -e "$SCRATCH_DIR" ]; then
+  echo "INFO: removing existing $SCRATCH_DIR"
+  rm -rf "$SCRATCH_DIR"
+fi
+
+CURRENT_SHA="$(git -C "$REPO_ROOT" rev-parse HEAD)"
+echo "cloning $REPO_ROOT @ $CURRENT_SHA -> $SCRATCH_DIR"
+git clone --no-local "$REPO_ROOT" "$SCRATCH_DIR" >/dev/null
+git -C "$SCRATCH_DIR" checkout --detach "$CURRENT_SHA" >/dev/null 2>&1
+
+echo "applying $OVERLAY"
+git -C "$SCRATCH_DIR" apply "$OVERLAY"
+
+# Verify markers. grep -c prints "0" AND exits 1 on zero matches, so `|| echo 0` would yield "0<newline>0"; use `|| true` plus a :-0 default.
+SSM_HITS=$(grep -c _SSM_ZERO_SENTINEL "$SCRATCH_DIR/vllm/v1/worker/utils.py" || true)
+KV_HITS=$(grep -c _KV_ZERO_SENTINEL "$SCRATCH_DIR/vllm/v1/core/single_type_kv_cache_manager.py" || true)
+if [ "${SSM_HITS:-0}" -lt 1 ] || [ "${KV_HITS:-0}" -lt 1 ]; then
+  echo "ERROR: sentinel markers missing after overlay (SSM=${SSM_HITS:-0} KV=${KV_HITS:-0})" >&2
+  exit 1
+fi
+
+echo "done: PATCHED_REPO=$SCRATCH_DIR ready"
+echo "next:  PATCHED_REPO=$SCRATCH_DIR scripts/ablation/run_ssm_ablation_suite.sh"
diff --git a/scripts/ablation/run_ssm_ablation_suite.sh b/scripts/ablation/run_ssm_ablation_suite.sh
new file mode 100755
index 000000000000..8a072858e76a
--- /dev/null
+++ b/scripts/ablation/run_ssm_ablation_suite.sh
@@ -0,0 +1,552 @@
+#!/usr/bin/env bash
+# Sentinel-gated 4-arm SSM zero-on-realloc ablation suite.
+#
+# Per-arm bind-mounts a per-arm sentinel dir at /run/nvllm :ro into the
+# container; the sentinel-gated overlay reads filesystem-existence as the
+# toggle (env vars are stripped by vLLM EngineCore subprocess spawn — see
+# memory:feedback_vllm_enginecore_env_strip).
+#
+# Sentinel files (presence == ENABLED, absence == DISABLED):
+#   /run/nvllm/zero_ssm_on_realloc.enabled
+#   /run/nvllm/kv_zero_for_mamba_ids.enabled
+#
+# Execution proof comes from a docker-log triad emitted by the sentinel
+# overlay (apply scripts/ablation/ssm_sentinel_overlay.patch to a clean
+# checkout to build $PATCHED_REPO):
+#   nvllm.ablation.sentinel_check name=<n> exists=<b> enabled=<b>
+#   nvllm.ablation.first_fire     name=<n> n_block_ids=<N>
+#   nvllm.ablation.fire_count     name=<n> count=<N>
+#
+# Arm matrix:
+#   both     - both sentinels present  (full patch active)
+#   neither  - no sentinels            (baseline)
+#   ssm_only - SSM sentinel only       (mamba zeroer only)
+#   kv_only  - KV sentinel only        (KV new-block-ids channel relax only)
+#
+# Usage:
+#   scripts/ablation/run_ssm_ablation_suite.sh           # default 4 arms x 5 runs
+#   scripts/ablation/run_ssm_ablation_suite.sh --force   # overwrite OUT_DIR
+#
+# Env overrides:
+#   OUT_DIR             default /tmp/ssm_ablation_suite
+#   NVLLM_IMAGE         default nvllm:gb10
+#   REPO_ROOT           default git toplevel of this script
+#   PATCHED_REPO        default /tmp/nvllm-ssm-sentinel-patched
+#                         (must contain the sentinel overlay applied to a clean
+#                          checkout; see scripts/ablation/ssm_sentinel_overlay.patch)
+#   SENTINELS_ROOT      default /tmp/nvllm-ablation-sentinels
+#   N_RUNS              default 5
+#   GSM8K_FLOOR         default 45
+#   CONTAINER           default nvllm-ssm-ablation
+#   READY_TIMEOUT_S     default 600
+
+set +u
+FORCE=0
+for arg in "$@"; do
+  case "$arg" in
+    --force) FORCE=1 ;;
+    *) echo "ERROR: unknown argument: $arg (only --force is accepted)" >&2; exit 64 ;;
+  esac
+done
+
+# ---------------------------------------------------------------------------
+# Defaults / inputs. Resolved BEFORE set -e per memory:feedback_bash_runner_patterns.
+# ---------------------------------------------------------------------------
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+OUT_DIR="${OUT_DIR:-/tmp/ssm_ablation_suite}"
+NVLLM_IMAGE="${NVLLM_IMAGE:-nvllm:gb10}"
+REPO_ROOT="${REPO_ROOT:-$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null || echo "")}"
+PATCHED_REPO="${PATCHED_REPO:-/tmp/nvllm-ssm-sentinel-patched}"
+SENTINELS_ROOT="${SENTINELS_ROOT:-/tmp/nvllm-ablation-sentinels}"
+N_RUNS="${N_RUNS:-5}"
+GSM8K_N="${GSM8K_N:-50}"
+GSM8K_SEED="${GSM8K_SEED:-42}"
+GSM8K_MAX_TOKENS="${GSM8K_MAX_TOKENS:-512}"
+GSM8K_TIMEOUT="${GSM8K_TIMEOUT:-600}"
+GSM8K_FLOOR="${GSM8K_FLOOR:-45}"
+CONTAINER="${CONTAINER:-nvllm-ssm-ablation}"
+READY_TIMEOUT_S="${READY_TIMEOUT_S:-600}"
+API="http://localhost:8000/v1"
+METRICS_URL="http://localhost:8000/metrics"
+HF_MODEL="${HF_MODEL:-ig1/Qwen3.5-27B-NVFP4}"
+SERVED_NAME="${SERVED_NAME:-default}"
+
+# ---------------------------------------------------------------------------
+# Validate inputs BEFORE set -e.
+# ---------------------------------------------------------------------------
+if [ -z "$REPO_ROOT" ] || ! git -C "$REPO_ROOT" rev-parse --git-dir >/dev/null 2>&1; then
+  echo "ERROR: REPO_ROOT='$REPO_ROOT' is not a git working tree" >&2; exit 1
+fi
+if ! docker image inspect "$NVLLM_IMAGE" >/dev/null 2>&1; then
+  echo "ERROR: docker image '$NVLLM_IMAGE' not found" >&2; exit 1
+fi
+for f in vllm/v1/worker/utils.py vllm/v1/worker/gpu_model_runner.py vllm/v1/core/single_type_kv_cache_manager.py; do
+  if [ ! -f "$PATCHED_REPO/$f" ]; then
+    echo "ERROR: patched file missing: $PATCHED_REPO/$f" >&2
+    echo "       Did you apply scripts/ablation/ssm_sentinel_overlay.patch to a clean checkout at PATCHED_REPO?" >&2
+    exit 1
+  fi
+done
+GSM8K_SCRIPT="$REPO_ROOT/scripts/gsm8k_eval_50.py"
+if [ ! -f "$GSM8K_SCRIPT" ]; then
+  echo "ERROR: gsm8k_eval_50.py missing at $GSM8K_SCRIPT" >&2; exit 1
+fi
+if [ ! -x "$REPO_ROOT/.venv/bin/python" ]; then
+  echo "ERROR: $REPO_ROOT/.venv/bin/python not found" >&2; exit 1
+fi
+if ! grep -q -- '--run-index' "$GSM8K_SCRIPT" || ! grep -q -- '--metrics-url' "$GSM8K_SCRIPT"; then
+  echo "ERROR: $GSM8K_SCRIPT missing --run-index / --metrics-url (not instrumented)" >&2; exit 1
+fi
+# Smoke-test that the patched files actually contain the sentinel markers.
+if ! grep -q "_SSM_ZERO_SENTINEL" "$PATCHED_REPO/vllm/v1/worker/utils.py"; then
+  echo "ERROR: $PATCHED_REPO/vllm/v1/worker/utils.py missing _SSM_ZERO_SENTINEL marker (overlay not applied?)" >&2; exit 1
+fi
+if ! grep -q "_KV_ZERO_SENTINEL" "$PATCHED_REPO/vllm/v1/core/single_type_kv_cache_manager.py"; then
+  echo "ERROR: $PATCHED_REPO/vllm/v1/core/single_type_kv_cache_manager.py missing _KV_ZERO_SENTINEL marker (overlay not applied?)" >&2; exit 1
+fi
+
+# Refuse stale OUT_DIR unless --force.
+if [ -d "$OUT_DIR" ] && [ "$(ls -A "$OUT_DIR" 2>/dev/null)" ] && [ "$FORCE" -ne 1 ]; then
+  echo "ERROR: $OUT_DIR is non-empty (rerun with --force to overwrite)" >&2; exit 1
+fi
+rm -rf "$OUT_DIR"
+mkdir -p "$OUT_DIR"
+
+set -euo pipefail
+log() { printf '[%(%Y-%m-%d %H:%M:%S)T] %s\n' -1 "$*"; }
+
+# Counter helper: awk-based count of lines matching a literal substring.
+# Replaces `grep -c PATTERN file || echo 0`, which emitted "0\n0" when grep
+# found zero matches (grep prints "0" + exits 1, triggering the || fallback).
+count_substr() {
+  # count_substr SUBSTR FILE: print how many lines of FILE contain SUBSTR (prints 0 if FILE is not a regular file).
+  local needle="$1" target="$2"
+  if [ -f "$target" ]; then
+    awk -v pat="$needle" 'index($0, pat) { n++ } END { print n+0 }' "$target"
+  else
+    printf '0'
+  fi
+}
+
+# ---------------------------------------------------------------------------
+# Per-arm sentinel directories. SENTINELS_ROOT is rebuilt every run so we
+# can be sure no stray sentinel from a prior arm leaks in.
+# ---------------------------------------------------------------------------
+rm -rf "$SENTINELS_ROOT"
+mkdir -p "$SENTINELS_ROOT"/{both,neither,ssm_only,kv_only}
+touch "$SENTINELS_ROOT/both/zero_ssm_on_realloc.enabled"
+touch "$SENTINELS_ROOT/both/kv_zero_for_mamba_ids.enabled"
+touch "$SENTINELS_ROOT/ssm_only/zero_ssm_on_realloc.enabled"
+touch "$SENTINELS_ROOT/kv_only/kv_zero_for_mamba_ids.enabled"
+# 'neither/' stays empty by design.
+
+log "sentinel dirs prepared:"
+for arm in both neither ssm_only kv_only; do
+  files=$(ls "$SENTINELS_ROOT/$arm" 2>/dev/null | tr '\n' ',' | sed 's/,$//')
+  log "  $SENTINELS_ROOT/$arm = [$files]"
+done
+
+# Common bind-mounts (patch files are pre-built in $PATCHED_REPO; no apply step).
+PATCHED_FILES=(
+  "vllm/v1/core/single_type_kv_cache_manager.py"
+  "vllm/v1/worker/utils.py"
+  "vllm/v1/worker/gpu_model_runner.py"
+)
+BIND_MOUNTS=()
+for f in "${PATCHED_FILES[@]}"; do
+  BIND_MOUNTS+=(-v "$PATCHED_REPO/$f:/app/nvllm/$f")
+done
+
+GIT_SHA="$(git -C "$REPO_ROOT" rev-parse HEAD)"
+IMAGE_ID="$(docker images --format '{{.Repository}}:{{.Tag}}@{{.ID}}' "$NVLLM_IMAGE" | head -n1)"
+IMAGE_DIGEST="$(docker inspect --format '{{index .RepoDigests 0}}' "$NVLLM_IMAGE" 2>/dev/null || true)"
+IMAGE_DIGEST="${IMAGE_DIGEST:-no-digest}"
+HOST_DRIVER="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n1 || echo "unknown")"
+HOST_KERNEL="$(uname -r 2>/dev/null || echo unknown)"
+HOST_NAME="$(hostname 2>/dev/null || echo unknown)"
+# Deterministic prompt-set identifier: (n, seed, model, served-name).
+PROMPT_SET_HASH="$(printf '%s|%s|%s|%s' "$GSM8K_N" "$GSM8K_SEED" "$HF_MODEL" "$SERVED_NAME" | sha256sum | awk '{print $1}')"
+
+# One-time runner manifest written before any arm runs.
+{
+  echo "{"
+  echo "  \"runner\": \"$0\","
+  echo "  \"started_utc\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\","
+  echo "  \"git_sha\": \"$GIT_SHA\","
+  echo "  \"image\": \"$NVLLM_IMAGE\","
+  echo "  \"image_id\": \"$IMAGE_ID\","
+  echo "  \"image_digest\": \"$IMAGE_DIGEST\","
+  echo "  \"patched_repo\": \"$PATCHED_REPO\","
+  echo "  \"sentinels_root\": \"$SENTINELS_ROOT\","
+  echo "  \"host_name\": \"$HOST_NAME\","
+  echo "  \"host_driver\": \"$HOST_DRIVER\","
+  echo "  \"host_kernel\": \"$HOST_KERNEL\","
+  echo "  \"gsm8k_n\": $GSM8K_N,"
+  echo "  \"gsm8k_seed\": $GSM8K_SEED,"
+  echo "  \"gsm8k_max_tokens\": $GSM8K_MAX_TOKENS,"
+  echo "  \"prompt_set_hash\": \"$PROMPT_SET_HASH\","
+  echo "  \"hf_model\": \"$HF_MODEL\","
+  echo "  \"n_runs\": $N_RUNS,"
+  echo "  \"arms\": [\"both\", \"neither\", \"ssm_only\", \"kv_only\"]"
+  echo "}"
+} > "$OUT_DIR/runner_manifest.json"
+log "runner manifest: $OUT_DIR/runner_manifest.json"
+
+# ---------------------------------------------------------------------------
+# Arm matrix.
+# ---------------------------------------------------------------------------
+ARM_NAMES=(both neither ssm_only kv_only)
+declare -A ARM_SSM=( [both]=1 [neither]=0 [ssm_only]=1 [kv_only]=0 )
+declare -A ARM_KV=(  [both]=1 [neither]=0 [ssm_only]=0 [kv_only]=1 )
+declare -a ARM_GATE_PASS
+declare -a ARM_GIT_SUMMARY
+
+for arm_idx in "${!ARM_NAMES[@]}"; do
+  ARM="${ARM_NAMES[$arm_idx]}"
+  SSM_VAL="${ARM_SSM[$ARM]}"
+  KV_VAL="${ARM_KV[$ARM]}"
+  ARM_DIR="$OUT_DIR/$ARM"
+  ARM_SENTINEL_DIR="$SENTINELS_ROOT/$ARM"
+  mkdir -p "$ARM_DIR"
+  log "========================================================================"
+  log "ARM $((arm_idx + 1))/4: $ARM (SSM_sentinel=$SSM_VAL, KV_sentinel=$KV_VAL)"
+  log "========================================================================"
+
+  arm_files=$(ls "$ARM_SENTINEL_DIR" 2>/dev/null | tr '\n' ',' | sed 's/,$//')
+  log "nvllm.ablation.arm=$ARM host_sentinels_dir=$ARM_SENTINEL_DIR container_sentinels_dir=/run/nvllm files=[$arm_files]"
+
+  docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
+
+  HOST_TRACE_DIR="$ARM_DIR/trace"
+  mkdir -p "$HOST_TRACE_DIR"
+  CONT_TRACE_PATH="/tmp/ssm_zero_trace/mamba_slot_trace.jsonl"
+  SERVE_LOG="$ARM_DIR/serve.log"
+
+  log "boot patched server (arm=$ARM, image=$NVLLM_IMAGE, container=$CONTAINER)"
+  # shellcheck disable=SC2086
+  docker run -d \
+    --name "$CONTAINER" \
+    --gpus all \
+    --ipc=host \
+    --network host \
+    --shm-size=8g \
+    -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \
+    -v "$HOME/.cache/flashinfer:/root/.cache/flashinfer" \
+    -v "$HOST_TRACE_DIR:/tmp/ssm_zero_trace" \
+    -v "$ARM_SENTINEL_DIR:/run/nvllm:ro" \
+    "${BIND_MOUNTS[@]}" \
+    -e VLLM_NVFP4_GEMM_BACKEND=cutlass \
+    -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \
+    -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \
+    -e NVLLM_MAMBA_SLOT_TRACE="$CONT_TRACE_PATH" \
+    -e CUTE_PHASE_E_FUSION=1 \
+    -e CUTE_PHASE_E_PATH=auto \
+    -e CUTE_PHASE_E_LAYERS="3,7" \
+    -e CUTE_PHASE_E_FALLBACK_RAISE=1 \
+    -e CUTE_WO_SPLIT=8 \
+    "$NVLLM_IMAGE" \
+    serve \
+    --model "$HF_MODEL" \
+    --served-model-name "$SERVED_NAME" \
+    --host 0.0.0.0 --port 8000 \
+    --gpu-memory-utilization 0.85 \
+    > "$SERVE_LOG" 2>&1
+
+  # Clear stale .pyc from bind-mounted dirs (memory:feedback_docker_bindmount).
+  sleep 2
+  docker exec "$CONTAINER" sh -c '
+    find /app/nvllm/vllm/v1/core /app/nvllm/vllm/v1/worker \
+         -maxdepth 3 -name "__pycache__" -type d \
+         -exec rm -rf {} + 2>/dev/null || true
+  ' || true
+
+  # Active readiness probe (memory:feedback_active_serve_readiness_probe).
+  deadline=$((SECONDS + READY_TIMEOUT_S))
+  log "wait for ready on $API/models ..."
+  READY=0
+  while [ "$SECONDS" -lt "$deadline" ]; do
+    if ! docker ps --filter "name=^/${CONTAINER}$" --format '{{.Names}}' | grep -qx "$CONTAINER"; then
+      echo "ERROR: container died during boot (arm=$ARM); tail $SERVE_LOG" >&2
+      docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true
+      ARM_GATE_PASS[$arm_idx]="boot_fail"
+      ARM_GIT_SUMMARY[$arm_idx]="-"
+      break
+    fi
+    if curl -fsS "$API/models" >/dev/null 2>&1; then
+      if curl -fsS "$API/completions" -H 'Content-Type: application/json' \
+          -d '{"model":"'"$SERVED_NAME"'","prompt":"warmup","max_tokens":8,"temperature":0}' \
+          >/dev/null 2>&1; then
+        log "ready (~${SECONDS}s)"
+        READY=1
+        break
+      fi
+    fi
+    sleep 5
+  done
+  if [ "$READY" -ne 1 ]; then
+    log "WARN: arm=$ARM did not become ready within ${READY_TIMEOUT_S}s"
+    docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true
+    docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
+    {
+      echo "{"
+      echo "  \"arm\": \"$ARM\","
+      echo "  \"ssm_sentinel\": $SSM_VAL,"
+      echo "  \"kv_sentinel\": $KV_VAL,"
+      echo "  \"git_sha\": \"$GIT_SHA\","
+      echo "  \"image\": \"$NVLLM_IMAGE\","
+      echo "  \"image_id\": \"$IMAGE_ID\","
+      echo "  \"n_runs\": $N_RUNS,"
+      echo "  \"gsm8k_floor\": $GSM8K_FLOOR,"
+      echo "  \"runs\": [],"
+      echo "  \"container_alive_at_end\": false,"
+      echo "  \"gate_pass\": false,"
+      echo "  \"reason\": \"server_never_ready\""
+      echo "}"
+    } > "$ARM_DIR/verdict.json"
+    ARM_GATE_PASS[$arm_idx]="${ARM_GATE_PASS[$arm_idx]:-not_ready}"  # keep "boot_fail" when the probe loop saw the container die
+    ARM_GIT_SUMMARY[$arm_idx]="-"
+    continue
+  fi
+
+  # Bind-mount proof: sentinel marker present in patched utils.py inside container.
+  # Use `|| true`, not `|| printf 0`: on zero matches grep -c already prints "0" and exits 1 (would give "0<newline>0").
+  INSIDE_MARKER=$(docker exec "$CONTAINER" sh -c "grep -c '_SSM_ZERO_SENTINEL' /app/nvllm/vllm/v1/worker/utils.py 2>/dev/null" || true)
+  if [ "${INSIDE_MARKER:-0}" -lt 1 ]; then
+    echo "FAIL: bind-mount did not land inside container (arm=$ARM)" >&2
+    docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true
+    docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
+    ARM_GATE_PASS[$arm_idx]="bind_fail"
+    ARM_GIT_SUMMARY[$arm_idx]="-"
+    continue
+  fi
+  log "bind-mount verified inside container (marker count=$INSIDE_MARKER)"
+
+  # Sentinel-dir proof: docker exec ls /run/nvllm
+  SENTINEL_LIST_INSIDE="$(docker exec "$CONTAINER" sh -c 'ls /run/nvllm 2>/dev/null | tr "\n" "," | sed "s/,$//"' || true)"
+  log "sentinels inside container /run/nvllm = [$SENTINEL_LIST_INSIDE]"
+
+  CONTAINER_ID="$(docker inspect --format '{{.Id}}' "$CONTAINER" 2>/dev/null || echo unknown)"
+  log "container id: $CONTAINER_ID"
+  docker inspect "$CONTAINER" > "$ARM_DIR/docker_inspect.json" 2>/dev/null || true
+
+  declare -a RUN_RESULTS=()
+
+  for run_idx in $(seq 1 "$N_RUNS"); do
+    RUN_DIR="$ARM_DIR/run${run_idx}"
+    mkdir -p "$RUN_DIR"
+    log "==> arm=$ARM run ${run_idx}/${N_RUNS} GSM8K-${GSM8K_N}"
+    set +e
+    ( cd "$REPO_ROOT" && \
+      .venv/bin/python "$GSM8K_SCRIPT" \
+        --api "$API" --model "$SERVED_NAME" \
+        --n "$GSM8K_N" --seed "$GSM8K_SEED" \
+        --max-tokens "$GSM8K_MAX_TOKENS" --timeout "$GSM8K_TIMEOUT" \
+        --label "ablation_${ARM}_run${run_idx}" \
+        --run-index "$run_idx" \
+        --metrics-url "$METRICS_URL" \
+        --save "$RUN_DIR/gsm8k.json" ) 2>&1 | tee "$RUN_DIR/gsm8k.log"
+    RC_GSM="${PIPESTATUS[0]}"
+    set -e
+    if [ "$RC_GSM" -ne 0 ]; then
+      log "WARN: arm=$ARM run ${run_idx} returned rc=$RC_GSM"
+    fi
+    if [ ! -f "$RUN_DIR/gsm8k.json" ]; then
+      log "FAIL: arm=$ARM run ${run_idx} did not produce gsm8k.json"
+      RUN_RESULTS+=("$run_idx fail no-json")
+      continue
+    fi
+    CORRECT="$("$REPO_ROOT/.venv/bin/python" -c "import json; print(json.load(open('$RUN_DIR/gsm8k.json'))['correct'])")"
+    ERRORS="$("$REPO_ROOT/.venv/bin/python" -c "import json; print(json.load(open('$RUN_DIR/gsm8k.json'))['errors'])")"
+    RUN_RESULTS+=("$run_idx $CORRECT $ERRORS")
+    log "<== arm=$ARM run ${run_idx} correct=$CORRECT errors=$ERRORS"
+  done
+
+  # Final capture + teardown for this arm.
+  docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true
+  CONTAINER_ALIVE="false"
+  if docker ps --filter "name=^/${CONTAINER}$" --format '{{.Names}}' | grep -qx "$CONTAINER"; then
+    CONTAINER_ALIVE="true"
+  fi
+
+  # Extract ablation event triad from docker logs - this is the execution proof.
+  ABLATION_EVENTS="$ARM_DIR/ablation_events.log"
+  grep -E "nvllm.ablation" "$ARM_DIR/docker.log" > "$ABLATION_EVENTS" || true
+  SENTINEL_CHECK_LINES=$(count_substr "nvllm.ablation.sentinel_check" "$ABLATION_EVENTS")
+  FIRST_FIRE_LINES=$(count_substr "nvllm.ablation.first_fire" "$ABLATION_EVENTS")
+  FIRE_COUNT_LINES=$(count_substr "nvllm.ablation.fire_count" "$ABLATION_EVENTS")
+  # Per-gate breakdown: did SSM gate fire? did KV gate fire?
+  SSM_FIRST_FIRE=$(count_substr "nvllm.ablation.first_fire name=ssm_zero_on_realloc" "$ABLATION_EVENTS")
+  KV_FIRST_FIRE=$(count_substr "nvllm.ablation.first_fire name=kv_zero_for_mamba_ids" "$ABLATION_EVENTS")
+  log "ablation events for $ARM: sentinel_check=$SENTINEL_CHECK_LINES first_fire=$FIRST_FIRE_LINES fire_count=$FIRE_COUNT_LINES (ssm_fire=$SSM_FIRST_FIRE kv_fire=$KV_FIRST_FIRE)"
+
+  # Harness validation gate: enabled => first_fire>=1; disabled => first_fire==0.
+  HARNESS_PASS="true"
+  HARNESS_REASON="ok"
+  if [ "$SSM_VAL" -eq 1 ] && [ "$SSM_FIRST_FIRE" -lt 1 ]; then
+    HARNESS_PASS="false"; HARNESS_REASON="ssm_enabled_but_no_first_fire"
+  fi
+  if [ "$SSM_VAL" -eq 0 ] && [ "$SSM_FIRST_FIRE" -gt 0 ]; then
+    HARNESS_PASS="false"; HARNESS_REASON="ssm_disabled_but_first_fire_observed"
+  fi
+  if [ "$KV_VAL" -eq 1 ] && [ "$KV_FIRST_FIRE" -lt 1 ]; then
+    HARNESS_PASS="false"; HARNESS_REASON="kv_enabled_but_no_first_fire"
+  fi
+  if [ "$KV_VAL" -eq 0 ] && [ "$KV_FIRST_FIRE" -gt 0 ]; then
+    HARNESS_PASS="false"; HARNESS_REASON="kv_disabled_but_first_fire_observed"
+  fi
+  log "harness validation for $ARM: pass=$HARNESS_PASS reason=$HARNESS_REASON"
+
+  docker rm -f "$CONTAINER" >/dev/null 2>&1 || true
+
+  TRACE_FILE="$HOST_TRACE_DIR/mamba_slot_trace.jsonl"
+  TRACE_LINES=0
+  if [ -f "$TRACE_FILE" ]; then
+    TRACE_LINES=$(wc -l < "$TRACE_FILE" | tr -d ' ')
+  fi
+
+  # Gate evaluation.
+  ALL_PASS="true"
+  for line in "${RUN_RESULTS[@]}"; do
+    read -r idx correct errors <<< "$line"
+    if [ "$correct" = "fail" ]; then ALL_PASS="false"; continue; fi
+    if [ "$correct" -lt "$GSM8K_FLOOR" ]; then ALL_PASS="false"; fi
+    if [ "$errors" -gt 0 ]; then ALL_PASS="false"; fi
+  done
+
+  CORRUPT_HITS=0
+  if [ -f "$ARM_DIR/docker.log" ]; then
+    CORRUPT_HITS=$(awk '/ERROR|FATAL|state.*corrupt/{n++} END{print n+0}' "$ARM_DIR/docker.log")
+  fi
+  [ "$CORRUPT_HITS" -gt 0 ] && ALL_PASS="false"
+  [ "$CONTAINER_ALIVE" != "true" ] && ALL_PASS="false"
+
+  # Per-arm token summary.
+  PERQ_FILE="$ARM_DIR/perq.jsonl"
+  rm -f "$PERQ_FILE"
+  for run_idx in $(seq 1 "$N_RUNS"); do
+    if [ -f "$ARM_DIR/run${run_idx}/perq.jsonl" ]; then
+      cat "$ARM_DIR/run${run_idx}/perq.jsonl" >> "$PERQ_FILE"
+    fi
+  done
+  TOKEN_SUMMARY="$($REPO_ROOT/.venv/bin/python - <<EOF
+import json, statistics
+sumc, sump, count = 0, 0, 0
+walls, decode_rates = [], []
+try:
+    with open("$PERQ_FILE") as f:
+        for line in f:
+            try:
+                r = json.loads(line)
+            except Exception:
+                continue
+            count += 1
+            sumc += int(r.get("completion_tokens", 0) or 0)
+            sump += int(r.get("prompt_tokens", 0) or 0)
+            w = float(r.get("wall_time_s", 0) or 0)
+            d = float(r.get("decode_tok_s", 0) or 0)
+            if w > 0:
+                walls.append(w)
+            if d > 0:
+                decode_rates.append(d)
+except FileNotFoundError:
+    pass
+print(json.dumps({
+    "n_questions": count,
+    "sum_completion_tokens": sumc,
+    "sum_prompt_tokens": sump,
+    "median_wall_time_s": (statistics.median(walls) if walls else 0.0),
+    "median_decode_tok_s": (statistics.median(decode_rates) if decode_rates else 0.0),
+}))
+EOF
+)"
+
+  {
+    echo "{"
+    echo "  \"arm\": \"$ARM\","
+    echo "  \"ssm_sentinel\": $SSM_VAL,"
+    echo "  \"kv_sentinel\": $KV_VAL,"
+    echo "  \"hypothesis\": \"ssm_zero_on_realloc_ablation_sentinel_gated\","
+    echo "  \"patched_repo\": \"$PATCHED_REPO\","
+    echo "  \"sentinel_dir\": \"$ARM_SENTINEL_DIR\","
+    echo "  \"sentinel_files_inside\": \"$SENTINEL_LIST_INSIDE\","
+    echo "  \"container_id\": \"$CONTAINER_ID\","
+    echo "  \"host_driver\": \"$HOST_DRIVER\","
+    echo "  \"prompt_set_hash\": \"$PROMPT_SET_HASH\","
+    echo "  \"harness_validation\": {\"pass\": $HARNESS_PASS, \"reason\": \"$HARNESS_REASON\", \"ssm_first_fire\": $SSM_FIRST_FIRE, \"kv_first_fire\": $KV_FIRST_FIRE},"
+    echo "  \"git_sha\": \"$GIT_SHA\","
+    echo "  \"image\": \"$NVLLM_IMAGE\","
+    echo "  \"image_id\": \"$IMAGE_ID\","
+    echo "  \"phase_e_layers\": \"3,7\","
+    echo "  \"phase_e_fusion\": 1,"
+    echo "  \"phase_e_path\": \"auto\","
+    echo "  \"wo_split\": 8,"
+    echo "  \"n_runs\": $N_RUNS,"
+    echo "  \"gsm8k_floor\": $GSM8K_FLOOR,"
+    echo "  \"mamba_slot_trace_lines\": $TRACE_LINES,"
+    echo "  \"ablation_events\": {\"sentinel_check\": $SENTINEL_CHECK_LINES, \"first_fire\": $FIRST_FIRE_LINES, \"fire_count\": $FIRE_COUNT_LINES},"
+    echo "  \"token_summary\": $TOKEN_SUMMARY,"
+    echo "  \"runs\": ["
+    first=1
+    for line in "${RUN_RESULTS[@]}"; do
+      read -r idx correct errors <<< "$line"
+      [ "$first" -eq 0 ] && echo "," || true
+      first=0
+      if [ "$correct" = "fail" ]; then
+        echo -n "    {\"run\": $idx, \"ok\": false, \"reason\": \"no_gsm8k_json\"}"
+      else
+        pass=true
+        [ "$correct" -lt "$GSM8K_FLOOR" ] && pass=false
+        [ "$errors" -gt 0 ] && pass=false
+        echo -n "    {\"run\": $idx, \"correct\": $correct, \"errors\": $errors, \"pass\": $pass}"
+      fi
+    done
+    echo
+    echo "  ],"
+    echo "  \"container_alive_at_end\": $CONTAINER_ALIVE,"
+    echo "  \"docker_log_corruption_hits\": $CORRUPT_HITS,"
+    echo "  \"gate_pass\": $ALL_PASS,"
+    echo "  \"harness_pass\": $HARNESS_PASS"
+    echo "}"
+  } > "$ARM_DIR/verdict.json"
+
+  ARM_GATE_PASS[$arm_idx]="$ALL_PASS"
+  SUM=""
+  for line in "${RUN_RESULTS[@]}"; do
+    read -r idx correct errors <<< "$line"
+    SUM+="${correct},"
+  done
+  ARM_GIT_SUMMARY[$arm_idx]="${SUM%,}"
+
+  log "<== arm=$ARM complete (gate_pass=$ALL_PASS, trace_lines=$TRACE_LINES, sentinel_check=$SENTINEL_CHECK_LINES, first_fire=$FIRST_FIRE_LINES, runs=$SUM)"
+done
+
+# ---------------------------------------------------------------------------
+# Aggregate comparison.json
+# ---------------------------------------------------------------------------
+{
+  echo "{"
+  echo "  \"out_dir\": \"$OUT_DIR\","
+  echo "  \"git_sha\": \"$GIT_SHA\","
+  echo "  \"image\": \"$NVLLM_IMAGE\","
+  echo "  \"n_runs\": $N_RUNS,"
+  echo "  \"gsm8k_floor\": $GSM8K_FLOOR,"
+  echo "  \"patched_repo\": \"$PATCHED_REPO\","
+  echo "  \"sentinels_root\": \"$SENTINELS_ROOT\","
+  echo "  \"arms\": ["
+  first=1
+  for arm_idx in "${!ARM_NAMES[@]}"; do
+    ARM="${ARM_NAMES[$arm_idx]}"
+    [ "$first" -eq 0 ] && echo "," || true
+    first=0
+    GP="${ARM_GATE_PASS[$arm_idx]:-unknown}"
+    SUM="${ARM_GIT_SUMMARY[$arm_idx]:-unknown}"
+    echo -n "    {\"arm\": \"$ARM\", \"ssm_sentinel\": ${ARM_SSM[$ARM]}, \"kv_sentinel\": ${ARM_KV[$ARM]}, \"gate_pass\": \"$GP\", \"correct_per_run\": \"$SUM\", \"verdict\": \"$OUT_DIR/$ARM/verdict.json\"}"
+  done
+  echo
+  echo "  ]"
+  echo "}"
+} > "$OUT_DIR/comparison.json"
+
+log "ablation suite complete"
+log "comparison: $OUT_DIR/comparison.json"
+exit 0
diff --git a/scripts/ablation/ssm_ablation_compare.py b/scripts/ablation/ssm_ablation_compare.py
new file mode 100755
index 000000000000..3e6920e22f85
--- /dev/null
+++ b/scripts/ablation/ssm_ablation_compare.py
@@ -0,0 +1,423 @@
+"""
+SSM zero-on-realloc ablation comparison tool.
+
+Reads each arm's verdict.json + per-question JSONL trace, emits a markdown
+comparison at <OUT_DIR>/ANALYSIS.md with:
+  - Verdict table (arm x run x correct/errors/gate_pass)
+  - Q1-Q50 per-question table for Run 4 (collapse window) across all 4 arms:
+    latency_s, completion_tokens, decode_tok_s, finish_reason, correct
+  - Aggregate per-arm steady-state stats: median decode_tok_s, p95 latency,
+    mean completion_tokens
+  - Friend's interpretation thresholds applied: which arm matches "real
+    pipeline win" vs "shortened generations"
+  - Drained KV invariant check from /metrics pre vs post snapshot.
+
+Usage:
+    python3 scripts/ablation/ssm_ablation_compare.py [OUT_DIR]
+
+Default OUT_DIR: /tmp/ssm_ablation_suite
+
+Reads:
+    <OUT_DIR>/<arm>/verdict.json
+    <OUT_DIR>/<arm>/perq.jsonl            (concatenated by runner)
+    <OUT_DIR>/<arm>/run<i>/perq.jsonl     (per-run, used for Run-4 table)
+    <OUT_DIR>/<arm>/run<i>/metrics_*.json (pre / q10..q50 / post snapshots)
+
+Writes:
+    <OUT_DIR>/ANALYSIS.md
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import statistics
+import sys
+from typing import Any
+
+# Arm presentation order = same as the runner.
+ARM_ORDER = ("both", "neither", "ssm_only", "kv_only")
+RUN_INDICES = (1, 2, 3, 4, 5)
+COLLAPSE_RUN = 4  # the friend's collapse-window pin
+METRIC_KEY_KV_USAGE = "vllm:kv_cache_usage_perc"
+METRIC_KEY_KV_USAGE_TOL = 0.05  # 5 percentage-point tolerance for "drained"
+
+# Friend's interpretation thresholds.
+# - "real pipeline win" = decode_tok_s materially higher AND completion_tokens
+#   not shortened relative to neither/baseline.
+# - "shortened generations" = decode_tok_s higher BUT mean completion_tokens
+#   notably lower (typical max_tokens=512 with finish_reason=length disappearing).
+TPOT_WIN_RATIO = 1.30          # >=30% decode_tok_s vs baseline = "win"
+SHORTEN_RATIO = 0.85           # <=85% of baseline mean completion_tokens = "shortened"
+
+
+def _load_json(path: str) -> Any:  # parsed JSON document, or None if it cannot be loaded
+    try:
+        with open(path) as f:
+            return json.load(f)
+    except (OSError, ValueError):  # unreadable file / malformed JSON or text; other errors surface
+        return None
+
+
+def _load_jsonl(path: str) -> list[dict]:
+    """Read JSONL *path* into a list of dicts, skipping unusable lines ([] if unreadable)."""
+    out: list[dict] = []
+    try:
+        with open(path) as f:
+            for line in f:
+                try:
+                    rec = json.loads(line) if line.strip() else None
+                except ValueError:
+                    continue
+                if isinstance(rec, dict):  # callers use rec.get(); drop scalar/array/null lines
+                    out.append(rec)
+    except OSError:
+        pass
+    return out
+
+
+def _p95(values: list[float]) -> float:
+    """Return the sorted value at index round(0.95 * (n - 1)); 0.0 for empty input."""
+    if not values:
+        return 0.0
+    ordered = sorted(values)
+    return ordered[max(0, int(round(0.95 * (len(ordered) - 1))))]
+
+
+def _fmt(v: Any, prec: int = 2) -> str:
+    """Format a table cell: "-" for None, *prec* decimals for floats, else str(v)."""
+    if v is None:
+        return "-"
+    # Only real floats get fixed precision; ints, bools and strings go through str().
+    return f"{v:.{prec}f}" if isinstance(v, float) else str(v)
+
+
+def _arm_stats(perq_records: list[dict]) -> dict:
+    """Aggregate one arm's per-question records into summary statistics.
+
+    The result has n_questions, median_decode_tok_s, p95_wall_time_s,
+    mean_completion_tokens and finish_reason_counts. The decode-rate and
+    wall-time statistics only use strictly positive samples; any statistic
+    with no samples is reported as 0.0 (so an empty input yields all zeros).
+    """
+
+    def _positive(field: str) -> list[float]:
+        # Missing, None, zero and negative readings are all dropped.
+        readings = (float(rec.get(field, 0) or 0) for rec in perq_records)
+        return [x for x in readings if x > 0]
+
+    rates = _positive("decode_tok_s")
+    latencies = _positive("wall_time_s")
+    # Unlike the two above, token counts keep zeros: every record contributes.
+    token_counts = [int(rec.get("completion_tokens", 0) or 0) for rec in perq_records]
+    # Histogram of finish_reason; a missing value is bucketed under "None".
+    histogram: dict[str, int] = {}
+    for rec in perq_records:
+        reason = str(rec.get("finish_reason"))
+        histogram[reason] = histogram.get(reason, 0) + 1
+
+    return {
+        "n_questions": len(perq_records),
+        "median_decode_tok_s": statistics.median(rates) if rates else 0.0,
+        "p95_wall_time_s": _p95(latencies),
+        "mean_completion_tokens": statistics.mean(token_counts) if token_counts else 0.0,
+        "finish_reason_counts": histogram,
+    }
+
+
+def _drained_invariant(metric_pre: dict, metric_post: dict) -> dict:
+    """Compare the KV-usage gauge between the pre and post /metrics snapshots.
+
+    Returns {"pre", "post", "delta_pp", "drained"}. "drained" is True when
+    |post - pre| <= METRIC_KEY_KV_USAGE_TOL, and None (with delta_pp None)
+    when a snapshot is not a dict or a gauge value is missing or non-numeric.
+    """
+    unknown = {"pre": None, "post": None, "delta_pp": None, "drained": None}
+    if not (isinstance(metric_pre, dict) and isinstance(metric_post, dict)):
+        return unknown
+    before = (metric_pre.get("metrics", {}) or {}).get(METRIC_KEY_KV_USAGE)
+    after = (metric_post.get("metrics", {}) or {}).get(METRIC_KEY_KV_USAGE)
+    # Echo back whatever raw values were found so the report can show them.
+    unknown.update(pre=before, post=after)
+    if before is None or after is None:
+        return unknown
+    try:
+        pre_f, post_f = float(before), float(after)
+    except (TypeError, ValueError):
+        return unknown
+    shift = post_f - pre_f
+    return {"pre": pre_f, "post": post_f, "delta_pp": shift,
+            "drained": abs(shift) <= METRIC_KEY_KV_USAGE_TOL}
+
+
+def _load_arm(out_dir: str, arm: str) -> dict:
+    """Load everything the report needs for one arm under <out_dir>/<arm>.
+
+    Returns a dict with:
+      verdict     - parsed verdict.json ({} when it cannot be loaded)
+      perq_concat - records from the arm-level perq.jsonl
+      runs        - {run_idx: {"gsm8k", "perq", "dir"}} for every RUN_INDICES entry
+      metrics     - {run_idx: {"pre", "post", plus any q10..q50 snapshot present}}
+    """
+    arm_dir = os.path.join(out_dir, arm)
+    runs: dict[int, dict] = {}
+    arm_metrics: dict[int, dict] = {}
+    for run_idx in RUN_INDICES:
+        run_dir = os.path.join(arm_dir, f"run{run_idx}")
+        runs[run_idx] = {
+            "gsm8k": _load_json(os.path.join(run_dir, "gsm8k.json")),
+            "perq": _load_jsonl(os.path.join(run_dir, "perq.jsonl")),
+            "dir": run_dir,
+        }
+        # pre/post keys always exist (None if the file is absent); q10..q50 only when loaded.
+        snapshots = {
+            tag: _load_json(os.path.join(run_dir, f"metrics_{tag}.json"))
+            for tag in ("pre", "post")
+        }
+        for tag in ("q10", "q20", "q30", "q40", "q50"):
+            snap = _load_json(os.path.join(run_dir, f"metrics_{tag}.json"))
+            if snap is not None:
+                snapshots[tag] = snap
+        arm_metrics[run_idx] = snapshots
+    return {
+        "verdict": _load_json(os.path.join(arm_dir, "verdict.json")) or {},
+        "perq_concat": _load_jsonl(os.path.join(arm_dir, "perq.jsonl")),
+        "runs": runs,
+        "metrics": arm_metrics,
+    }
+
+
+def _interpretation(stats_by_arm: dict[str, dict]) -> dict[str, str]:
+    """Classify each arm against the 'neither' baseline (friend's thresholds).
+
+    Decode counts as a win at >= TPOT_WIN_RATIO x the baseline median
+    decode_tok_s; generations count as shortened below SHORTEN_RATIO x the
+    baseline mean completion_tokens. An arm with no stats of its own is
+    reported as "no data" rather than scored as a 0.00x regression.
+    Returns {arm: verdict string} for every arm in ARM_ORDER.
+    """
+    def _stat(stats: dict, key: str) -> float:
+        # Missing, None and 0 all collapse to 0.0 ("not measured").
+        return float(stats.get(key) or 0.0)
+
+    # Verdict text keyed by (decode win?, completion length kept?).
+    templates = {
+        (True, True): "REAL pipeline win (decode {d:.2f}x baseline, compt {c:.2f}x)",
+        (True, False): "SHORTENED generations - speed inflated (decode {d:.2f}x, compt only {c:.2f}x)",
+        (False, True): "no decode win vs baseline (decode {d:.2f}x, compt {c:.2f}x)",
+        (False, False): "no win + shortened (decode {d:.2f}x, compt {c:.2f}x)",
+    }
+    baseline = stats_by_arm.get("neither", {})
+    base_decode = _stat(baseline, "median_decode_tok_s")
+    base_compt = _stat(baseline, "mean_completion_tokens")
+    out: dict[str, str] = {}
+    for arm in ARM_ORDER:
+        s = stats_by_arm.get(arm, {})
+        decode = _stat(s, "median_decode_tok_s")
+        compt = _stat(s, "mean_completion_tokens")
+        if arm == "neither":
+            out[arm] = "baseline"
+        elif base_decode <= 0 or base_compt <= 0:
+            out[arm] = "no baseline available"
+        elif decode <= 0 and compt <= 0:
+            out[arm] = "no data for this arm"
+        else:
+            d, c = decode / base_decode, compt / base_compt
+            key = (bool(d >= TPOT_WIN_RATIO), bool(c >= SHORTEN_RATIO))
+            out[arm] = templates[key].format(d=d, c=c)
+    return out
+
+
+def _render_verdict_table(arms_data: dict[str, dict]) -> str:
+    """Render the verdict table: one row per arm, one column per run index.
+
+    Header and cells both iterate RUN_INDICES so they cannot drift apart; a
+    cell is ``<correct>/<errors>err``, ``FAIL(<reason>)`` or ``-`` (run absent).
+    """
+    head = ["Arm", "SSM", "KV", *(f"Run {i}" for i in RUN_INDICES), "Gate"]
+    rule = ["-----", "-----", "----", *("-------" for _ in RUN_INDICES), "------"]
+    lines: list[str] = ["| " + " | ".join(head) + " |", "|" + "|".join(rule) + "|"]
+    for arm in ARM_ORDER:
+        v = arms_data.get(arm, {}).get("verdict", {})
+        ssm = v.get("ssm_zero_on_realloc", v.get("ssm_sentinel", "?"))
+        kv = v.get("kv_zero_for_mamba_ids", v.get("kv_sentinel", "?"))
+        runs = v.get("runs") or []
+        by_idx = {int(r.get("run", -1)): r for r in runs if isinstance(r, dict)}
+        cells: list[str] = []
+        for run_idx in RUN_INDICES:
+            r = by_idx.get(run_idx)
+            if r is None:
+                cells.append("-")
+            elif "correct" not in r:
+                cells.append(f"FAIL({r.get('reason', '?')})")
+            else:
+                cells.append(f"{r['correct']}/{r.get('errors', 0)}err")
+        gate = str(v.get("gate_pass", "?"))
+        lines.append("| " + " | ".join([arm, str(ssm), str(kv), *cells, gate]) + " |")
+    return "\n".join(lines)
+
+
+def _render_run_table(arms_data: dict[str, dict], run_idx: int) -> str:
+    """Render the per-question markdown table for one run across all arms.
+
+    One row per prompt_index seen in any arm, five columns per arm in
+    ARM_ORDER (lat, ct, dtok/s, fr, ok). An arm lacking a record for a
+    question contributes five "-" cells. When no arm has any record for the
+    run, a one-line placeholder is returned instead of a table.
+    """
+    suffixes = ("lat", "ct", "dtok/s", "fr", "ok")
+    # Per arm: prompt_index -> record; a later duplicate replaces an earlier one.
+    by_arm: dict[str, dict[int, dict]] = {}
+    for arm in ARM_ORDER:
+        run_info = arms_data.get(arm, {}).get("runs", {}).get(run_idx, {})
+        records = run_info.get("perq") or []
+        by_arm[arm] = {int(rec.get("prompt_index", -1)): rec for rec in records}
+
+    # Union of prompt indices across arms, in ascending order.
+    questions = sorted(set().union(*by_arm.values()))
+    if not questions:
+        return f"_(no per-Q records for run {run_idx}_)"
+
+    header = ["Q"] + [f"{arm}:{sfx}" for arm in ARM_ORDER for sfx in suffixes]
+    table = [
+        "| " + " | ".join(header) + " |",
+        "|" + "|".join("---" for _ in header) + "|",
+    ]
+    for q in questions:
+        cells = [str(q)]
+        for arm in ARM_ORDER:
+            rec = by_arm[arm].get(q)
+            if rec is None:
+                cells.extend(["-"] * len(suffixes))
+                continue
+            cells.extend(
+                [
+                    _fmt(rec.get("wall_time_s"), 2),
+                    _fmt(rec.get("completion_tokens"), 0),
+                    _fmt(rec.get("decode_tok_s"), 2),
+                    _fmt(rec.get("finish_reason")),
+                    "Y" if rec.get("correct") else "N",
+                ]
+            )
+        table.append("| " + " | ".join(cells) + " |")
+    return "\n".join(table)
+
+
+def _render_steady_state(stats_by_arm: dict[str, dict]) -> str:
+    """Render the aggregate steady-state stats as a markdown table, one row per arm."""
+    table = [
+        "| Arm | N | median dtok/s | p95 wall_s | mean completion_tokens | finish_reason hist |",
+        "|-----|---|---------------|------------|------------------------|--------------------|",
+    ]
+    for arm in ARM_ORDER:
+        stats = stats_by_arm.get(arm, {})
+        hist = stats.get("finish_reason_counts", {}) or {}
+        cells = [
+            arm, str(stats.get("n_questions", 0)),
+            _fmt(stats.get("median_decode_tok_s"), 2),
+            _fmt(stats.get("p95_wall_time_s"), 2),
+            _fmt(stats.get("mean_completion_tokens"), 1),
+            ", ".join(f"{k}={v}" for k, v in sorted(hist.items())),
+        ]
+        table.append("| " + " | ".join(cells) + " |")
+    return "\n".join(table)
+
+
+def _render_drained_section(arms_data: dict[str, dict]) -> str:
+    """Render the drained-KV check as a table with one row per (arm, run).
+
+    Each row shows the pre / post / delta values reported by
+    _drained_invariant and Y or N for its ``drained`` flag ("-" if None).
+    """
+    out = [
+        f"Tolerance: |delta| <= {METRIC_KEY_KV_USAGE_TOL:.2f} (5 pp) counts as drained.",
+        "",
+        "| Arm | Run | KV pre | KV post | delta | drained |",
+        "|-----|-----|--------|---------|-------|---------|",
+    ]
+    # One row per (arm, run); a missing snapshot is passed through as None.
+    for arm in ARM_ORDER:
+        per_run = arms_data.get(arm, {}).get("metrics", {})
+        for run_idx in RUN_INDICES:
+            snaps = per_run.get(run_idx, {})
+            check = _drained_invariant(snaps.get("pre"), snaps.get("post"))
+            flag = check.get("drained")
+            mark = "-" if flag is None else ("Y" if flag else "N")
+            nums = [_fmt(check.get(k), 4) for k in ("pre", "post", "delta_pp")]
+            out.append("| " + " | ".join([arm, str(run_idx), *nums, mark]) + " |")
+    return "\n".join(out)
+
+
+def main():
+    """Build <OUT_DIR>/ANALYSIS.md and print its path.
+
+    OUT_DIR is argv[1] (default /tmp/ssm_ablation_suite); exits 2 if not a directory.
+    """
+    out_dir = "/tmp/ssm_ablation_suite" if len(sys.argv) <= 1 else sys.argv[1]
+    if not os.path.isdir(out_dir):
+        print(f"ERROR: OUT_DIR not found: {out_dir}", file=sys.stderr)
+        sys.exit(2)
+
+    arms_data: dict[str, dict] = {}
+    stats_by_arm = {}
+    for arm in ARM_ORDER:
+        arms_data[arm] = _load_arm(out_dir, arm)
+        # Steady-state stats come from the arm-level concatenated perq records.
+        stats_by_arm[arm] = _arm_stats(arms_data[arm].get("perq_concat") or [])
+    interp = _interpretation(stats_by_arm)
+    comp = _load_json(os.path.join(out_dir, "comparison.json")) or {}
+
+    md: list[str] = [
+        "# SSM zero-on-realloc ablation: 4-arm comparison",
+        "",
+        f"- OUT_DIR: `{out_dir}`",
+        f"- git_sha: `{comp.get('git_sha', '?')}`",
+        f"- image: `{comp.get('image', '?')}`",
+        f"- N runs per arm: {comp.get('n_runs', '?')}",
+        f"- gsm8k_floor: {comp.get('gsm8k_floor', '?')}",
+        "",
+        "## Verdict table (run x correct/errors)",
+        "",
+        _render_verdict_table(arms_data),
+        "",
+        f"## Per-question table - Run {COLLAPSE_RUN} (collapse window)",
+        "",
+        (
+            "Columns per arm: lat (wall_time_s), ct (completion_tokens), "
+            "dtok/s (decode_tok_s), fr (finish_reason), ok (correct)."
+        ),
+        "",
+        _render_run_table(arms_data, COLLAPSE_RUN),
+        "",
+        "## Aggregate per-arm steady-state stats (concat across runs)",
+        "",
+        _render_steady_state(stats_by_arm),
+        "",
+        "## Friend's interpretation thresholds applied",
+        "",
+        (
+            f"- 'real pipeline win' iff median decode_tok_s >= "
+            f"{TPOT_WIN_RATIO:.2f}x baseline ('neither') AND mean completion_tokens "
+            f">= {SHORTEN_RATIO:.2f}x baseline"
+        ),
+        (
+            f"- 'shortened generations' iff decode rate up but completion_tokens "
+            f"< {SHORTEN_RATIO:.2f}x baseline"
+        ),
+        "",
+        *(f"- **{arm}**: {interp.get(arm, '-')}" for arm in ARM_ORDER),
+        "",
+        "## Drained KV invariant (per-run pre vs post)",
+        "",
+        _render_drained_section(arms_data),
+        "",
+    ]
+
+    out_md = os.path.join(out_dir, "ANALYSIS.md")
+    with open(out_md, "w") as f:
+        f.write("\n".join(md) + "\n")
+    print(out_md)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/ablation/ssm_sentinel_overlay.patch b/scripts/ablation/ssm_sentinel_overlay.patch
new file mode 100644
index 000000000000..005d17eceb02
--- /dev/null
+++ b/scripts/ablation/ssm_sentinel_overlay.patch
@@ -0,0 +1,248 @@
+diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py
+index fa5395685..c2f0663ac 100644
+--- a/vllm/v1/core/single_type_kv_cache_manager.py
++++ b/vllm/v1/core/single_type_kv_cache_manager.py
+@@ -1,10 +1,14 @@
+ # SPDX-License-Identifier: Apache-2.0
+ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+ import itertools
++import json
++import os
++import time
+ from abc import ABC, abstractmethod
+ from collections import defaultdict
+ from collections.abc import Sequence
+ 
++from vllm.logger import init_logger
+ from vllm.utils.math_utils import cdiv
+ from vllm.v1.core.block_pool import BlockPool
+ from vllm.v1.core.kv_cache_utils import (
+@@ -24,6 +28,77 @@ from vllm.v1.kv_cache_interface import (
+ )
+ from vllm.v1.request import Request
+ 
++logger = init_logger(__name__)
++
++# --- nvllm KV-zero-for-mamba-ids ablation knob (sentinel-file gated) --------
++# Env vars get stripped by vLLM EngineCore subprocess spawn
++# (memory:feedback_vllm_enginecore_env_strip), so the gate is a filesystem
++# sentinel bind-mounted :ro into the container. Path is hardcoded; do NOT
++# read from env. Default OFF (no sentinel -> unpatched behavior).
++#
++# Sentinel:  /run/nvllm/kv_zero_for_mamba_ids.enabled
++# Effect:    relax the new_block_ids gate so MambaSpec allocations also push
++#            onto the new_block_ids channel (both in the base manager and in
++#            MambaManager.allocate_new_blocks align branch).
++# (Companion SSM zero-on-realloc gate lives in worker/utils.py.)
++from pathlib import Path as _Path
++_KV_ZERO_SENTINEL = _Path("/run/nvllm/kv_zero_for_mamba_ids.enabled")
++_KV_ZERO_CACHED: bool | None = None
++_KV_ZERO_FIRE_COUNT = 0
++
++
++def _kv_zero_for_mamba_ids_enabled() -> bool:
++    """Return the cached sentinel result; the first call probes and logs it."""
++    global _KV_ZERO_CACHED
++    if _KV_ZERO_CACHED is not None:
++        return _KV_ZERO_CACHED
++    _KV_ZERO_CACHED = present = _KV_ZERO_SENTINEL.exists()
++    logger.info(
++        "nvllm.ablation.sentinel_check name=kv_zero_for_mamba_ids "
++        "path=%s exists=%s enabled=%s",
++        _KV_ZERO_SENTINEL, present, present,
++    )
++    return present
++
++
++def _kv_zero_for_mamba_ids_fired(n_block_ids: int) -> None:
++    """Count one firing of the gated path; log the first and every 100th."""
++    global _KV_ZERO_FIRE_COUNT
++    _KV_ZERO_FIRE_COUNT += 1
++    fired = _KV_ZERO_FIRE_COUNT
++    if fired == 1:
++        logger.info(
++            "nvllm.ablation.first_fire name=kv_zero_for_mamba_ids n_block_ids=%d",
++            n_block_ids,
++        )
++    elif not fired % 100:
++        logger.info(
++            "nvllm.ablation.fire_count name=kv_zero_for_mamba_ids count=%d",
++            fired,
++        )
++
++
++# --- nvllm mamba slot tracer (env-gated JSONL diagnostic) -------------------
++# NVLLM_MAMBA_SLOT_TRACE=/path/to.jsonl enables JSONL append at alloc_align +
++# free sites; used by the ablation harness to confirm lifecycle events.
++_MAMBA_TRACE_PATH = os.environ.get("NVLLM_MAMBA_SLOT_TRACE", "")
++_MAMBA_TRACE_FH = None
++
++
++def _mamba_trace(record: dict) -> None:
++    """Write *record* as one JSONL line to the trace file; no-op if tracing is off."""
++    global _MAMBA_TRACE_FH
++    if _MAMBA_TRACE_PATH:
++        try:
++            if _MAMBA_TRACE_FH is None:
++                _MAMBA_TRACE_FH = open(_MAMBA_TRACE_PATH, "a", buffering=1)
++            record.setdefault("ts", time.time())
++            line = json.dumps(record, separators=(",", ":"))
++            _MAMBA_TRACE_FH.write(line + "\n")
++        except Exception:
++            # Tracing must never break serving.
++            pass
++
+ 
+ class SingleTypeKVCacheManager(ABC):
+     """
+@@ -209,8 +284,15 @@ class SingleTypeKVCacheManager(ABC):
+                 cdiv(num_total_computed_tokens, self.block_size) - len(req_blocks)
+             )
+             req_blocks.extend(allocated_blocks)
+-            if type(self.kv_cache_spec) is FullAttentionSpec:
++            _is_full = type(self.kv_cache_spec) is FullAttentionSpec
++            _is_mamba_gated = (
++                type(self.kv_cache_spec) is MambaSpec
++                and _kv_zero_for_mamba_ids_enabled()
++            )
++            if _is_full or _is_mamba_gated:
+                 self.new_block_ids.extend(b.block_id for b in allocated_blocks)
++                if _is_mamba_gated:
++                    _kv_zero_for_mamba_ids_fired(len(allocated_blocks))
+ 
+     def allocate_new_blocks(
+         self, request_id: str, num_tokens: int, num_tokens_main_model: int
+@@ -237,8 +319,15 @@ class SingleTypeKVCacheManager(ABC):
+         else:
+             new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
+             req_blocks.extend(new_blocks)
+-            if type(self.kv_cache_spec) is FullAttentionSpec:
++            _is_full = type(self.kv_cache_spec) is FullAttentionSpec
++            _is_mamba_gated = (
++                type(self.kv_cache_spec) is MambaSpec
++                and _kv_zero_for_mamba_ids_enabled()
++            )
++            if _is_full or _is_mamba_gated:
+                 self.new_block_ids.extend(b.block_id for b in new_blocks)
++                if _is_mamba_gated:
++                    _kv_zero_for_mamba_ids_fired(len(new_blocks))
+             return new_blocks
+ 
+     def take_new_block_ids(self) -> list[int]:
+@@ -1005,10 +1094,37 @@ class MambaManager(SingleTypeKVCacheManager):
+                     assert num_new_blocks <= self.num_speculative_blocks + 1
+                 new_blocks = self.block_pool.get_new_blocks(num_new_blocks)
+                 req_blocks.extend(new_blocks)
++                # Mirror SingleTypeKVCacheManager.allocate_new_blocks: push the
++                # freshly-allocated block IDs onto the new_block_ids channel
++                # so the worker-side zero-on-realloc pass clears recycled
++                # mamba conv_state / ssm_state slots before the next prefill
++                # writes into them. Skip null blocks. Sentinel-gated.
++                if _kv_zero_for_mamba_ids_enabled():
++                    _new_ids = [b.block_id for b in new_blocks if not b.is_null]
++                    self.new_block_ids.extend(_new_ids)
++                    if _new_ids:
++                        _kv_zero_for_mamba_ids_fired(len(_new_ids))
+                 self._allocated_block_reqs.add(request_id)
++                _mamba_trace({
++                    "op": "alloc_align",
++                    "request_id": request_id,
++                    "block_ids": [b.block_id for b in new_blocks],
++                })
+                 return req_blocks[prev_block_len:]
+ 
+     def free(self, request_id: str) -> None:
++        # Snapshot block IDs for the trace BEFORE super().free() clears them.
++        if _MAMBA_TRACE_PATH:
++            freed_ids = [
++                b.block_id
++                for b in self.req_to_blocks.get(request_id, ())
++                if not b.is_null
++            ]
++            _mamba_trace({
++                "op": "free",
++                "request_id": request_id,
++                "block_ids": freed_ids,
++            })
+         if self.mamba_cache_mode == "align":
+             self._allocated_block_reqs.discard(request_id)
+             self.last_state_block_idx.pop(request_id, None)
+diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
+index 0335d5e0e..270b112d7 100644
+--- a/vllm/v1/worker/utils.py
++++ b/vllm/v1/worker/utils.py
+@@ -77,25 +77,49 @@ def _zero_kv_blocks_kernel(
+     tl.store(ptr + offset + cols, tl.zeros([BLOCK_SIZE], dtype=tl.int32))
+ 
+ 
+-# Ops-visibility counter for the SSM zero-on-realloc pass. Logged at first
+-# fire (count==1) and every 100th fire thereafter. Module-global; survives
+-# across requests within a worker process. EngineCore strips env vars on
+-# subprocess spawn, so this is intentionally not env-gated.
++# --- nvllm SSM zero-on-realloc ablation knob (sentinel-file gated) ----------
++# Env vars get stripped by vLLM EngineCore subprocess spawn
++# (memory:feedback_vllm_enginecore_env_strip), so the gate is a filesystem
++# sentinel bind-mounted :ro into the container. Path is hardcoded; do NOT
++# read from env. Default OFF (no sentinel -> unpatched behavior).
++#
++# Sentinel:  /run/nvllm/zero_ssm_on_realloc.enabled
++# Effect:    KVBlockZeroer.zero_block_ids walks the sister MambaBlockZeroer
++#            on the same block-ID list. With the sentinel ABSENT the call is
++#            skipped entirely (gate returns False).
++from pathlib import Path as _Path
++_SSM_ZERO_SENTINEL = _Path("/run/nvllm/zero_ssm_on_realloc.enabled")
++_SSM_ZERO_CACHED: bool | None = None
+ _SSM_ZERO_FIRE_COUNT = 0
+ 
+ 
++def _ssm_zero_on_realloc_enabled() -> bool:
++    """Return the cached sentinel result; the first call probes and logs it."""
++    global _SSM_ZERO_CACHED
++    if _SSM_ZERO_CACHED is not None:
++        return _SSM_ZERO_CACHED
++    _SSM_ZERO_CACHED = present = _SSM_ZERO_SENTINEL.exists()
++    logger.info(
++        "nvllm.ablation.sentinel_check name=ssm_zero_on_realloc "
++        "path=%s exists=%s enabled=%s",
++        _SSM_ZERO_SENTINEL, present, present,
++    )
++    return present
++
++
+ def _ssm_zero_fired(n_block_ids: int) -> None:
+-    """Increment fire counter and log at first fire + every 100th."""
++    """Increment fire counter; log first_fire (count==1) and every 100th."""
+     global _SSM_ZERO_FIRE_COUNT
+     _SSM_ZERO_FIRE_COUNT += 1
+     if _SSM_ZERO_FIRE_COUNT == 1:
+         logger.info(
+-            "nvllm.ssm_zero_on_realloc.first_fire n_block_ids=%d",
++            "nvllm.ablation.first_fire name=ssm_zero_on_realloc "
++            "n_block_ids=%d",
+             n_block_ids,
+         )
+     elif _SSM_ZERO_FIRE_COUNT % 100 == 0:
+         logger.info(
+-            "nvllm.ssm_zero_on_realloc.fire_count count=%d",
++            "nvllm.ablation.fire_count name=ssm_zero_on_realloc count=%d",
+             _SSM_ZERO_FIRE_COUNT,
+         )
+ 
+@@ -293,9 +317,14 @@ class KVBlockZeroer:
+         """Zero the KV cache memory for the given block IDs."""
+         # Mamba layers are zeroed via PyTorch index_fill_; the sister zeroer
+         # tolerates the full block-ID list because it filters per-tensor by
+-        # leading dim. Runs unconditionally so recycled conv_state / ssm_state
+-        # rows are cleared before the next prefill writes into them.
+-        if block_ids and self._mamba_zeroer is not None:
++        # leading dim. Sentinel-gated: /run/nvllm/zero_ssm_on_realloc.enabled
++        # present -> on. Without the sentinel the production path is skipped
++        # for ablation A/B vs unpatched baseline.
++        if (
++            block_ids
++            and self._mamba_zeroer is not None
++            and _ssm_zero_on_realloc_enabled()
++        ):
+             self._mamba_zeroer.zero_block_ids(block_ids)
+             _ssm_zero_fired(len(block_ids))
+         if not block_ids or self._meta is None:
diff --git a/scripts/gsm8k_eval_50.py b/scripts/gsm8k_eval_50.py
index 9f239a414488..b0eeb48dac83 100644
--- a/scripts/gsm8k_eval_50.py
+++ b/scripts/gsm8k_eval_50.py
@@ -2,19 +2,34 @@
 GSM8K 50-question random eval against a vLLM server.
 
 Reads cached HF gsm8k test parquet (1319 questions), samples N with a fixed
-seed (default 50, seed=42 — reproducible), sends each to /v1/completions at
+seed (default 50, seed=42 - reproducible), sends each to /v1/completions at
 temperature=0, parses final numeric answer.
 
 Per memory:feedback_eval_completions: /v1/completions, NOT /v1/chat/completions.
 
+Instrumented form (2026-05-15) for the SSM zero-on-realloc ablation suite:
+  - per-question JSONL trace at <output-dir>/perq.jsonl (one record per Q)
+  - --run-index flag, stamped into every per-Q record
+  - usage tokens (prompt/completion/total) + decode_tok/s + finish_reason
+  - output sha256 (16 hex), request id, character count
+  - --metrics-url flag: snapshots vllm:* prometheus metrics at pre/q10/q20/
+    q30/q40/q50/post tags, saved to <output-dir>/metrics_<tag>.json
+
+Timing semantics PRESERVED: wall_time_s is the time from the
+requests.post() start to response received, EXCLUDING metrics-snapshot time.
+
 Usage:
     .venv/bin/python scripts/gsm8k_eval_50.py \\
         --api http://localhost:8000/v1 --model default \\
-        --n 50 --save out.json --label some_run_name
+        --n 50 --save out.json --label some_run_name \\
+        --run-index 1 \\
+        --metrics-url http://localhost:8000/metrics
 """
 
 import argparse
+import hashlib
 import json
+import os
 import re
 import sys
 import time
@@ -33,9 +48,17 @@
     "ee7b8da9e381df27b9e3f7758a159ab2bdaa4dbaa910546cbbc47e0cb44e4f59"
 )
 
+# Subset of /metrics lines we extract into the per-snapshot JSON.
+METRICS_KEYS = (
+    "vllm:num_requests_running",
+    "vllm:num_requests_waiting",
+    "vllm:kv_cache_usage_perc",
+    "vllm:generation_tokens_total",
+    "vllm:num_preemptions_total",
+)
+
 
 def _load_test_split():
-    import os
     if os.path.exists(GSM8K_TEST_ARROW):
         with pa.memory_map(GSM8K_TEST_ARROW, "rb") as src:
             return ipc.open_stream(src).read_all().to_pylist()
@@ -65,6 +88,60 @@ def normalize(s: str) -> str:
         return s.strip()
 
 
+def _parse_metric_values(body, keys):
+    """Sum, for each metric name in *keys*, its samples in Prometheus text *body*.
+
+    Samples are summed across label sets. The value is the first field after
+    the name (or after the closing "}" of the label block), so an optional
+    trailing timestamp is never mistaken for the value.
+    """
+    values: dict = {}
+    for line in body.splitlines():
+        if not line or line.startswith("#"):
+            continue
+        for key in keys:
+            # A sample is "<name> <value> [ts]" or "<name>{labels} <value> [ts]".
+            if line.startswith(key + "{"):
+                rest = line[line.rfind("}") + 1:]
+            elif line.startswith(key + " "):
+                rest = line[len(key):]
+            else:
+                continue
+            try:
+                values[key] = values.get(key, 0.0) + float(rest.split()[0])
+            except (IndexError, ValueError):
+                pass  # malformed sample line: skip it
+            break
+    return values
+
+
+def _snapshot_metrics(metrics_url, tag, perq_dir):
+    """Fetch /metrics, extract METRICS_KEYS, write metrics_<tag>.json.
+
+    Best-effort: failures never abort the eval. Returns None when metrics_url
+    or perq_dir is unset; otherwise returns the snapshot dict, which holds
+    "metrics" on success or "error" when the fetch failed. Called outside
+    the wall_time_s timer.
+    """
+    if not metrics_url or not perq_dir:
+        return None
+    # Any fetch or parse failure is recorded in the snapshot instead of raised.
+    try:
+        r = requests.get(metrics_url, timeout=10)
+        r.raise_for_status()
+        values = _parse_metric_values(r.text, METRICS_KEYS)
+        snap = {"tag": tag, "ts": time.time(), "metrics": values}
+    except Exception as e:
+        snap = {"tag": tag, "ts": time.time(), "error": repr(e)}
+    try:
+        with open(os.path.join(perq_dir, f"metrics_{tag}.json"), "w") as f:
+            json.dump(snap, f, indent=2)
+    except Exception as e:
+        # Best-effort: report the failed write but keep the eval running.
+        sys.stderr.write(f"WARN: cannot write metrics_{tag}.json: {e}\n")
+    return snap
+
+
 def main():
     ap = argparse.ArgumentParser()
     ap.add_argument("--api", default="http://localhost:8000/v1")
@@ -75,14 +152,39 @@ def main():
     ap.add_argument("--timeout", type=int, default=180)
     ap.add_argument("--label", default="gsm8k_50")
     ap.add_argument("--save", default=None)
+    # Instrumented additions:
+    ap.add_argument(
+        "--run-index", type=int, default=0,
+        help="Soak run index, stamped into every per-Q JSONL record",
+    )
+    ap.add_argument(
+        "--metrics-url", default=None,
+        help="If set, snapshot /metrics pre / q10..q50 / post into "
+             "<output-dir>/metrics_<tag>.json (timing NOT charged to wall).",
+    )
     args = ap.parse_args()
 
+    # perq_dir = directory holding gsm8k.json (i.e. <output-dir>)
+    perq_dir = None
+    perq_fh = None
+    if args.save:
+        perq_dir = os.path.dirname(os.path.abspath(args.save)) or "."
+        try:
+            os.makedirs(perq_dir, exist_ok=True)
+            perq_fh = open(os.path.join(perq_dir, "perq.jsonl"), "a", buffering=1)
+        except Exception as e:
+            sys.stderr.write(f"WARN: cannot open perq.jsonl: {e}\n")
+            perq_fh = None
+
     table = _load_test_split()
 
     import random
     rng = random.Random(args.seed)
     sample = rng.sample(table, args.n)
 
+    # Pre-flight metrics snapshot (NOT charged to any question's wall time).
+    _snapshot_metrics(args.metrics_url, "pre", perq_dir)
+
     results = []
     correct = 0
     errors = 0
@@ -99,13 +201,25 @@ def main():
             "temperature": 0.0,
             "stop": ["\nQ:", "\nQuestion:"],
         }
+
+        # Per-question instrumentation defaults (filled in on success).
+        usage = {}
+        finish_reason = None
+        request_id = None
+        text = ""
+
         ts = time.time()
         try:
             r = requests.post(
                 f"{args.api}/completions", json=body, timeout=args.timeout
             )
             r.raise_for_status()
-            text = r.json()["choices"][0]["text"]
+            payload = r.json()
+            choice0 = payload.get("choices", [{}])[0]
+            text = choice0.get("text", "")
+            finish_reason = choice0.get("finish_reason")
+            usage = payload.get("usage", {}) or {}
+            request_id = payload.get("id")
             pred = normalize(extract_predicted(text))
             ok = (pred == gold)
             status = "OK" if ok else "WRONG"
@@ -116,25 +230,91 @@ def main():
             pred = ""
             status = "ERROR"
             errors += 1
-        elapsed = time.time() - ts
+            ok = False
+        wall_time_s = time.time() - ts
+
+        # Per-Q instrumentation record.
+        prompt_tokens = usage.get("prompt_tokens", 0) if isinstance(usage, dict) else 0
+        completion_tokens = (
+            usage.get("completion_tokens", 0) if isinstance(usage, dict) else 0
+        )
+        total_tokens = usage.get("total_tokens", 0) if isinstance(usage, dict) else 0
+        decode_tok_s = (
+            (completion_tokens / wall_time_s)
+            if (completion_tokens and wall_time_s > 0) else 0.0
+        )
+        output_len = len(text) if isinstance(text, str) else 0
+        try:
+            output_sha256 = hashlib.sha256(
+                (text if isinstance(text, str) else "").encode("utf-8", "replace")
+            ).hexdigest()[:16]
+        except Exception:
+            output_sha256 = ""
+
+        perq_rec = {
+            "label": args.label,
+            "run_index": args.run_index,
+            "prompt_index": i + 1,  # 1-based, matches "[N/50]" log format
+            "wall_time_s": round(wall_time_s, 4),
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "decode_tok_s": round(decode_tok_s, 3),
+            "finish_reason": finish_reason,
+            "gold": gold,
+            "pred": pred,
+            "correct": bool(ok),
+            "output_len": output_len,
+            "output_sha256": output_sha256,
+            "request_id": request_id,
+            "ts": ts,
+        }
+        if perq_fh is not None:
+            try:
+                perq_fh.write(json.dumps(perq_rec, separators=(",", ":")) + "\n")
+            except Exception:
+                pass
 
+        # Aggregate JSON results (same shape as before; do not break callers).
         results.append({
             "i": i,
             "expected": gold,
             "got": pred,
             "status": status,
-            "elapsed": round(elapsed, 1),
+            "elapsed": round(wall_time_s, 1),
             "raw_tail": text[-200:] if isinstance(text, str) else "",
             "question": q[:80] + "..." if len(q) > 80 else q,
+            # New non-breaking fields (additive, do not alter existing keys):
+            "wall_time_s": round(wall_time_s, 4),
+            "prompt_tokens": prompt_tokens,
+            "completion_tokens": completion_tokens,
+            "total_tokens": total_tokens,
+            "decode_tok_s": round(decode_tok_s, 3),
+            "finish_reason": finish_reason,
+            "output_len": output_len,
+            "output_sha256": output_sha256,
+            "request_id": request_id,
         })
 
-        # progress on stderr
+        # progress on stderr (preserve existing format for log greppability)
         sys.stderr.write(
-            f"[{i + 1}/{args.n}] {status} (gold={gold} pred={pred}) {elapsed:.1f}s\n"
+            f"[{i + 1}/{args.n}] {status} (gold={gold} pred={pred}) "
+            f"{wall_time_s:.1f}s ct={completion_tokens} dtok/s={decode_tok_s:.2f} "
+            f"fr={finish_reason}\n"
         )
         sys.stderr.flush()
 
+        # Mid-eval metrics snapshots (after Q10/20/30/40/50). Done AFTER
+        # wall_time_s is recorded, so snapshot cost is never charged to a
+        # question's decode latency.
+        if (i + 1) in (10, 20, 30, 40, 50):
+            _snapshot_metrics(args.metrics_url, f"q{i + 1}", perq_dir)
+
     total_t = time.time() - t0
+
+    # Post-eval snapshot (NOT charged to wall).
+    _snapshot_metrics(args.metrics_url, "post", perq_dir)
+
     out = {
         "label": args.label,
         "model": args.model,
@@ -146,10 +326,17 @@ def main():
         "accuracy": f"{correct}/{args.n} ({100*correct/args.n:.1f}%)",
         "total_seconds": round(total_t, 1),
         "results": results,
+        # Additive aggregate fields (won't break existing parsers).
+        "run_index": args.run_index,
     }
     if args.save:
         with open(args.save, "w") as f:
             json.dump(out, f, indent=2)
+    if perq_fh is not None:
+        try:
+            perq_fh.close()
+        except Exception:
+            pass
     print(json.dumps({k: v for k, v in out.items() if k != "results"}, indent=2))
 
 
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 928d66bccbba..61e247747395 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -1031,8 +1031,11 @@ def _init_kv_zero_meta(self) -> None:
         Called from gpu_worker.py outside the CuMem pool context.
         """
         self._kv_block_zeroer = KVBlockZeroer(self.device, self.pin_memory)
+        # Materialize the attn-groups iterator so KVBlockZeroer can walk it
+        # twice (once for full-attn segments, once for Mamba state tensors).
+        attn_groups_list = list(self._kv_cache_spec_attn_group_iterator())
         self._kv_block_zeroer.init_meta(
-            attn_groups_iter=self._kv_cache_spec_attn_group_iterator(),
+            attn_groups_iter=attn_groups_list,
             kernel_block_sizes=self._kernel_block_sizes,
             cache_dtype=self.cache_config.cache_dtype,
             runner_only_attn_layers=self.runner_only_attn_layers,
diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py
index 83fc12cb5c3b..0335d5e0e4d4 100644
--- a/vllm/v1/worker/utils.py
+++ b/vllm/v1/worker/utils.py
@@ -77,6 +77,95 @@ def _zero_kv_blocks_kernel(
     tl.store(ptr + offset + cols, tl.zeros([BLOCK_SIZE], dtype=tl.int32))
 
 
+# Ops-visibility counter for the SSM zero-on-realloc pass. Logged at first
+# fire (count==1) and every 100th fire thereafter. Module-global; survives
+# across requests within a worker process. EngineCore strips env vars on
+# subprocess spawn, so this is intentionally not env-gated.
+_SSM_ZERO_FIRE_COUNT = 0
+
+
+def _ssm_zero_fired(n_block_ids: int) -> None:
+    """Bump the module-wide SSM-zero fire counter and log sparsely.
+
+    The very first fire logs ``n_block_ids``; after that only every 100th
+    fire logs, and it reports the running total instead.
+    """
+    global _SSM_ZERO_FIRE_COUNT
+    fires = _SSM_ZERO_FIRE_COUNT + 1
+    _SSM_ZERO_FIRE_COUNT = fires
+    if fires == 1:
+        logger.info("nvllm.ssm_zero_on_realloc.first_fire n_block_ids=%d", n_block_ids)
+        return
+    if fires % 100 == 0:
+        logger.info("nvllm.ssm_zero_on_realloc.fire_count count=%d", fires)
+
+
+class MambaBlockZeroer:
+    """Zeroes Mamba conv_state / ssm_state rows for given block IDs.
+
+    Mamba state tensors have a per-layer leading "block" dim equal to the
+    number of blocks; row ``[block_id]`` is one block of state. The full-attn
+    KVBlockZeroer Triton kernel assumes a uniform page size across all
+    registered segments, which does not hold once Mamba layers are mixed in
+    (conv vs ssm vs attn page sizes all differ). This zeroer instead uses
+    PyTorch ``index_fill_`` per registered state tensor: simple, idempotent,
+    and called only at request-free / block-realloc time (not in the hot
+    decode path).
+    """
+
+    def __init__(self, device: torch.device, pin_memory: bool):
+        self.device = device
+        self.pin_memory = pin_memory
+        self._tensors: list[torch.Tensor] = []
+
+    def init_meta(
+        self,
+        attn_groups_iter: Iterable["AttentionGroup"],
+        static_forward_context: dict[str, Any],
+    ) -> None:
+        """Register each distinct state tensor of every MambaSpec layer.
+
+        The registry is rebuilt from scratch on every call, so re-running
+        init_meta never leaves stale or duplicate tensors behind.
+        """
+        self._tensors = []
+        seen: set[int] = set()
+        for group in attn_groups_iter:
+            if not isinstance(group.kv_cache_spec, MambaSpec):
+                continue
+            for layer_name in group.layer_names:
+                layer = static_forward_context.get(layer_name)
+                # Mamba layers bind kv_cache as a list/tuple of state tensors
+                # (conv_state, ssm_state, ...); anything else is skipped.
+                kv = getattr(layer, "kv_cache", None)
+                if not isinstance(kv, (list, tuple)):
+                    continue
+                for state in kv:
+                    if not isinstance(state, torch.Tensor):
+                        continue
+                    # Skip tensors whose data_ptr() was already registered.
+                    if (key := state.data_ptr()) not in seen:
+                        seen.add(key)
+                        self._tensors.append(state)
+
+    def zero_block_ids(self, block_ids: list[int]) -> None:
+        """Zero row ``block_id`` of every registered state tensor."""
+        if not block_ids or not self._tensors:
+            return
+        # IDs outside a tensor's leading dim are dropped for that tensor. The
+        # filtered index depends only on that leading dim, so it is built and
+        # copied to the device once per distinct size, not once per tensor.
+        idx_cpu = torch.tensor(block_ids, dtype=torch.long)
+        idx_by_rows: dict[int, torch.Tensor] = {}
+        for state in self._tensors:
+            n = state.shape[0]
+            if n not in idx_by_rows:
+                local_ids = idx_cpu[(idx_cpu >= 0) & (idx_cpu < n)]
+                idx_by_rows[n] = local_ids.to(device=self.device, non_blocking=True)
+            if idx_by_rows[n].numel() > 0:
+                state.index_fill_(0, idx_by_rows[n], 0)
+
+
 class KVBlockZeroer:
     """Manages efficient zeroing of KV cache blocks via a Triton kernel.
 
@@ -92,6 +181,7 @@ def __init__(self, device: torch.device, pin_memory: bool):
         self._id_cap: int = 0
         self._ids_pinned: torch.Tensor | None = None
         self._ids_gpu: torch.Tensor | None = None
+        self._mamba_zeroer: MambaBlockZeroer | None = None
 
     def init_meta(
         self,
@@ -112,8 +202,21 @@ def init_meta(
         PAGE_SIZE_EL accounts for this ratio so that
         ``block_id * PAGE_SIZE_EL`` lands at the correct offset.
 
-        Only AttentionSpec layers are processed; Mamba layers are skipped.
+        Full-attention layers go through the Triton zeroing kernel below.
+        Mamba layers are handed off to MambaBlockZeroer for a per-tensor
+        PyTorch ``index_fill_`` because the conv / ssm page sizes differ from
+        the full-attn page size and cannot share PAGE_SIZE_EL.
         """
+        # Materialize the iterator so it can be walked twice (full-attn here +
+        # mamba in MambaBlockZeroer.init_meta below). gpu_model_runner.py
+        # already passes a list; this is defensive against other callers.
+        if not isinstance(attn_groups_iter, (list, tuple)):
+            attn_groups_iter = list(attn_groups_iter)
+
+        # Set up sister zeroer for Mamba layers BEFORE the full-attn walk.
+        self._mamba_zeroer = MambaBlockZeroer(self.device, self.pin_memory)
+        self._mamba_zeroer.init_meta(attn_groups_iter, static_forward_context)
+
         seen_ptrs: set[int] = set()
         seg_addrs: list[int] = []
         page_size_el: int | None = None
@@ -188,6 +291,13 @@ def init_meta(
 
     def zero_block_ids(self, block_ids: list[int]) -> None:
         """Zero the KV cache memory for the given block IDs."""
+        # Mamba layers are zeroed via PyTorch index_fill_; the sister zeroer
+        # tolerates the full block-ID list because it filters per-tensor by
+        # leading dim. Runs unconditionally so recycled conv_state / ssm_state
+        # rows are cleared before the next prefill writes into them.
+        if block_ids and self._mamba_zeroer is not None:
+            self._mamba_zeroer.zero_block_ids(block_ids)
+            _ssm_zero_fired(len(block_ids))
         if not block_ids or self._meta is None:
             return
         seg_addrs, page_size_el, blk_size, n_segs = self._meta