diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/ANALYSIS.md b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/ANALYSIS.md new file mode 100644 index 000000000000..1084aab57101 --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/ANALYSIS.md @@ -0,0 +1,120 @@ +# SSM zero-on-realloc ablation: 4-arm comparison + +- OUT_DIR: `/tmp/ssm_ablation_suite_v2` +- git_sha: `670724746c596f6c095970c4d50b82e6328423db` +- image: `nvllm:gb10-d2_7` +- N runs per arm: 5 +- gsm8k_floor: 45 + +## Verdict table (run x correct/errors) + +| Arm | SSM | KV | Run 1 | Run 2 | Run 3 | Run 4 | Run 5 | Gate | +|-----|-----|----|-------|-------|-------|-------|-------|------| +| both | 1 | 1 | 48/0err | 48/0err | 48/0err | 48/0err | 48/0err | True | +| neither | 0 | 0 | 48/0err | 48/0err | 48/0err | 48/0err | 48/0err | True | +| ssm_only | 1 | 0 | 48/0err | 48/0err | 48/0err | 48/0err | 48/0err | True | +| kv_only | 0 | 1 | 47/0err | 47/0err | 47/0err | 47/0err | 47/0err | True | + +## Per-question table - Run 4 (collapse window) + +Columns per arm: lat (wall_time_s), ct (completion_tokens), dtok/s (decode_tok_s), fr (finish_reason), ok (correct). 
+ +| Q | both:lat | both:ct | both:dtok/s | both:fr | both:ok | neither:lat | neither:ct | neither:dtok/s | neither:fr | neither:ok | ssm_only:lat | ssm_only:ct | ssm_only:dtok/s | ssm_only:fr | ssm_only:ok | kv_only:lat | kv_only:ct | kv_only:dtok/s | kv_only:fr | kv_only:ok | +|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---| +| 1 | 16.47 | 153 | 9.29 | stop | N | 16.43 | 153 | 9.31 | stop | N | 16.43 | 153 | 9.31 | stop | N | 16.68 | 155 | 9.29 | stop | N | +| 2 | 23.52 | 217 | 9.23 | stop | Y | 23.53 | 217 | 9.22 | stop | Y | 23.54 | 217 | 9.22 | stop | Y | 12.73 | 116 | 9.11 | stop | Y | +| 3 | 11.60 | 106 | 9.14 | stop | Y | 11.60 | 106 | 9.13 | stop | Y | 11.61 | 106 | 9.13 | stop | Y | 13.88 | 127 | 9.15 | stop | Y | +| 4 | 19.42 | 179 | 9.21 | stop | Y | 19.43 | 179 | 9.21 | stop | Y | 19.44 | 179 | 9.21 | stop | Y | 20.66 | 190 | 9.20 | stop | Y | +| 5 | 20.82 | 192 | 9.22 | stop | Y | 20.81 | 192 | 9.22 | stop | Y | 20.83 | 192 | 9.22 | stop | Y | 16.46 | 151 | 9.18 | stop | Y | +| 6 | 13.74 | 126 | 9.17 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.76 | 126 | 9.16 | stop | Y | 14.43 | 132 | 9.15 | stop | Y | +| 7 | 13.63 | 125 | 9.17 | stop | Y | 13.64 | 125 | 9.17 | stop | Y | 13.65 | 125 | 9.16 | stop | Y | 11.20 | 102 | 9.11 | stop | Y | +| 8 | 19.64 | 181 | 9.22 | stop | Y | 19.65 | 181 | 9.21 | stop | Y | 19.63 | 181 | 9.22 | stop | Y | 15.28 | 140 | 9.16 | stop | N | +| 9 | 10.10 | 92 | 9.11 | stop | Y | 10.10 | 92 | 9.11 | stop | Y | 10.10 | 92 | 9.11 | stop | Y | 10.12 | 92 | 9.09 | stop | Y | +| 10 | 13.21 | 121 | 9.16 | stop | Y | 13.21 | 121 | 9.16 | stop | Y | 13.20 | 121 | 9.16 | stop | Y | 10.88 | 99 | 9.10 | stop | Y | +| 11 | 19.55 | 180 | 9.21 | stop | Y | 19.56 | 180 | 9.21 | stop | Y | 19.55 | 180 | 9.21 | stop | Y | 45.53 | 421 | 9.25 | stop | Y | +| 12 | 21.24 | 196 | 9.23 | stop | Y | 21.25 | 196 | 9.22 | stop | Y | 21.24 | 196 | 9.23 | stop | Y | 22.49 | 207 | 9.21 | stop | Y | +| 13 | 16.96 | 
156 | 9.20 | stop | Y | 16.97 | 156 | 9.19 | stop | Y | 16.96 | 156 | 9.20 | stop | Y | 18.29 | 168 | 9.18 | stop | Y | +| 14 | 15.02 | 138 | 9.19 | stop | Y | 15.03 | 138 | 9.18 | stop | Y | 15.02 | 138 | 9.19 | stop | Y | 14.74 | 135 | 9.16 | stop | Y | +| 15 | 14.49 | 133 | 9.18 | stop | Y | 14.50 | 133 | 9.17 | stop | Y | 14.48 | 133 | 9.18 | stop | Y | 16.16 | 148 | 9.16 | stop | Y | +| 16 | 11.28 | 103 | 9.13 | stop | Y | 11.29 | 103 | 9.13 | stop | Y | 11.27 | 103 | 9.14 | stop | Y | 10.98 | 100 | 9.10 | stop | Y | +| 17 | 11.38 | 104 | 9.14 | stop | Y | 11.39 | 104 | 9.13 | stop | Y | 11.38 | 104 | 9.14 | stop | Y | 14.21 | 130 | 9.15 | stop | Y | +| 18 | 13.75 | 126 | 9.17 | stop | Y | 13.75 | 126 | 9.17 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.03 | 119 | 9.13 | stop | Y | +| 19 | 11.17 | 102 | 9.13 | stop | Y | 11.17 | 102 | 9.13 | stop | Y | 11.17 | 102 | 9.13 | stop | Y | 10.65 | 97 | 9.10 | stop | Y | +| 20 | 9.98 | 91 | 9.11 | stop | Y | 9.99 | 91 | 9.11 | stop | Y | 10.00 | 91 | 9.10 | stop | Y | 9.38 | 85 | 9.07 | stop | Y | +| 21 | 13.41 | 123 | 9.17 | stop | Y | 13.42 | 123 | 9.17 | stop | Y | 13.41 | 123 | 9.17 | stop | Y | 13.34 | 122 | 9.14 | stop | Y | +| 22 | 26.52 | 245 | 9.24 | stop | Y | 26.52 | 245 | 9.24 | stop | Y | 26.51 | 245 | 9.24 | stop | Y | 26.26 | 242 | 9.22 | stop | Y | +| 23 | 9.67 | 88 | 9.10 | stop | Y | 9.67 | 88 | 9.10 | stop | Y | 9.67 | 88 | 9.10 | stop | Y | 13.24 | 121 | 9.14 | stop | Y | +| 24 | 15.45 | 142 | 9.19 | stop | Y | 15.46 | 142 | 9.19 | stop | Y | 15.44 | 142 | 9.19 | stop | Y | 15.49 | 142 | 9.17 | stop | Y | +| 25 | 35.08 | 325 | 9.26 | stop | Y | 35.09 | 325 | 9.26 | stop | Y | 35.08 | 325 | 9.27 | stop | Y | 25.08 | 231 | 9.21 | stop | Y | +| 26 | 16.85 | 155 | 9.20 | stop | Y | 16.86 | 155 | 9.19 | stop | Y | 16.85 | 155 | 9.20 | stop | Y | 20.34 | 187 | 9.19 | stop | Y | +| 27 | 14.28 | 131 | 9.17 | stop | Y | 14.27 | 131 | 9.18 | stop | Y | 14.28 | 131 | 9.18 | stop | Y | 9.59 | 87 | 9.07 | 
stop | Y | +| 28 | 24.69 | 228 | 9.24 | stop | Y | 24.69 | 228 | 9.23 | stop | Y | 24.68 | 228 | 9.24 | stop | Y | 27.98 | 258 | 9.22 | stop | Y | +| 29 | 16.75 | 154 | 9.20 | stop | Y | 16.76 | 154 | 9.19 | stop | Y | 16.75 | 154 | 9.19 | stop | Y | 17.01 | 156 | 9.17 | stop | Y | +| 30 | 11.38 | 104 | 9.14 | stop | Y | 11.37 | 104 | 9.15 | stop | Y | 11.37 | 104 | 9.14 | stop | Y | 10.33 | 94 | 9.10 | stop | Y | +| 31 | 19.22 | 177 | 9.21 | stop | Y | 19.22 | 177 | 9.21 | stop | Y | 19.22 | 177 | 9.21 | stop | Y | 20.03 | 184 | 9.19 | stop | Y | +| 32 | 17.93 | 165 | 9.20 | stop | Y | 17.93 | 165 | 9.20 | stop | Y | 17.93 | 165 | 9.20 | stop | Y | 18.95 | 174 | 9.18 | stop | Y | +| 33 | 14.49 | 133 | 9.18 | stop | Y | 14.50 | 133 | 9.17 | stop | Y | 14.49 | 133 | 9.18 | stop | Y | 18.19 | 167 | 9.18 | stop | Y | +| 34 | 25.86 | 239 | 9.24 | stop | Y | 25.87 | 239 | 9.24 | stop | Y | 25.85 | 239 | 9.25 | stop | Y | 24.64 | 227 | 9.21 | stop | Y | +| 35 | 34.01 | 315 | 9.26 | stop | Y | 34.03 | 315 | 9.26 | stop | Y | 34.02 | 315 | 9.26 | stop | Y | 20.66 | 190 | 9.19 | stop | Y | +| 36 | 14.74 | 135 | 9.16 | stop | Y | 14.72 | 135 | 9.17 | stop | Y | 14.72 | 135 | 9.17 | stop | Y | 16.15 | 148 | 9.16 | stop | Y | +| 37 | 21.80 | 201 | 9.22 | stop | Y | 21.79 | 201 | 9.22 | stop | Y | 21.79 | 201 | 9.22 | stop | Y | 21.95 | 202 | 9.20 | stop | Y | +| 38 | 21.02 | 194 | 9.23 | stop | Y | 21.03 | 194 | 9.22 | stop | Y | 21.03 | 194 | 9.22 | stop | Y | 21.30 | 196 | 9.20 | stop | Y | +| 39 | 16.74 | 154 | 9.20 | stop | Y | 16.75 | 154 | 9.20 | stop | Y | 16.74 | 154 | 9.20 | stop | Y | 17.21 | 158 | 9.18 | stop | Y | +| 40 | 20.17 | 186 | 9.22 | stop | Y | 20.18 | 186 | 9.21 | stop | Y | 20.18 | 186 | 9.22 | stop | Y | 12.38 | 113 | 9.13 | stop | Y | +| 41 | 25.83 | 239 | 9.25 | stop | Y | 25.86 | 239 | 9.24 | stop | Y | 25.88 | 239 | 9.23 | stop | Y | 30.79 | 284 | 9.22 | stop | Y | +| 42 | 17.81 | 164 | 9.21 | stop | Y | 17.82 | 164 | 9.20 | stop | Y | 17.83 | 164 | 
9.20 | stop | Y | 17.00 | 156 | 9.17 | stop | Y | +| 43 | 11.60 | 106 | 9.14 | stop | Y | 11.61 | 106 | 9.13 | stop | Y | 11.60 | 106 | 9.13 | stop | Y | 16.25 | 149 | 9.17 | stop | Y | +| 44 | 9.03 | 82 | 9.09 | stop | Y | 9.03 | 82 | 9.08 | stop | Y | 9.02 | 82 | 9.09 | stop | Y | 12.81 | 117 | 9.13 | stop | Y | +| 45 | 55.13 | 512 | 9.29 | length | N | 55.18 | 512 | 9.28 | length | N | 55.15 | 512 | 9.28 | length | N | 55.28 | 512 | 9.26 | length | N | +| 46 | 21.28 | 198 | 9.30 | stop | Y | 21.26 | 198 | 9.31 | stop | Y | 21.26 | 198 | 9.31 | stop | Y | 25.19 | 234 | 9.29 | stop | Y | +| 47 | 20.08 | 185 | 9.21 | stop | Y | 20.08 | 185 | 9.21 | stop | Y | 20.07 | 185 | 9.22 | stop | Y | 20.12 | 185 | 9.19 | stop | Y | +| 48 | 26.50 | 245 | 9.24 | stop | Y | 26.52 | 245 | 9.24 | stop | Y | 26.50 | 245 | 9.24 | stop | Y | 29.47 | 272 | 9.23 | stop | Y | +| 49 | 13.73 | 126 | 9.18 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.74 | 126 | 9.17 | stop | Y | 13.13 | 120 | 9.14 | stop | Y | +| 50 | 22.21 | 205 | 9.23 | stop | Y | 22.22 | 205 | 9.22 | stop | Y | 22.22 | 205 | 9.22 | stop | Y | 14.75 | 135 | 9.15 | stop | Y | + +## Aggregate per-arm steady-state stats (concat across runs) + +| Arm | N | median dtok/s | p95 wall_s | mean completion_tokens | finish_reason hist | +|-----|---|---------------|------------|------------------------|--------------------| +| both | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 | +| neither | 250 | 9.19 | 34.03 | 169.5 | length=5, stop=245 | +| ssm_only | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 | +| kv_only | 250 | 9.17 | 30.74 | 169.5 | length=5, stop=245 | + +## Friend's interpretation thresholds applied + +- 'real pipeline win' iff median decode_tok_s >= 1.30x baseline ('neither') AND mean completion_tokens >= 0.85x baseline +- 'shortened generations' iff decode rate up but completion_tokens < 0.85x baseline + +- **both**: no decode win vs baseline (decode 1.00x, compt 1.00x) +- **neither**: baseline +- 
**ssm_only**: no decode win vs baseline (decode 1.00x, compt 1.00x) +- **kv_only**: no decode win vs baseline (decode 1.00x, compt 1.00x) + +## Drained KV invariant (per-run pre vs post) + +Tolerance: |delta| <= 0.05 (5 pp) counts as drained. + +| Arm | Run | KV pre | KV post | delta | drained | +|-----|-----|--------|---------|-------|---------| +| both | 1 | 0.0000 | 0.0027 | 0.0027 | Y | +| both | 2 | 0.0000 | 0.0027 | 0.0027 | Y | +| both | 3 | 0.0000 | 0.0027 | 0.0027 | Y | +| both | 4 | 0.0000 | 0.0027 | 0.0027 | Y | +| both | 5 | 0.0000 | 0.0027 | 0.0027 | Y | +| neither | 1 | 0.0000 | 0.0027 | 0.0027 | Y | +| neither | 2 | 0.0000 | 0.0027 | 0.0027 | Y | +| neither | 3 | 0.0000 | 0.0027 | 0.0027 | Y | +| neither | 4 | 0.0000 | 0.0027 | 0.0027 | Y | +| neither | 5 | 0.0000 | 0.0027 | 0.0027 | Y | +| ssm_only | 1 | 0.0000 | 0.0027 | 0.0027 | Y | +| ssm_only | 2 | 0.0000 | 0.0027 | 0.0027 | Y | +| ssm_only | 3 | 0.0000 | 0.0027 | 0.0027 | Y | +| ssm_only | 4 | 0.0000 | 0.0027 | 0.0027 | Y | +| ssm_only | 5 | 0.0000 | 0.0027 | 0.0027 | Y | +| kv_only | 1 | 0.0000 | 0.0027 | 0.0027 | Y | +| kv_only | 2 | 0.0000 | 0.0027 | 0.0027 | Y | +| kv_only | 3 | 0.0000 | 0.0027 | 0.0027 | Y | +| kv_only | 4 | 0.0000 | 0.0027 | 0.0027 | Y | +| kv_only | 5 | 0.0000 | 0.0027 | 0.0027 | Y | + diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/comparison.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/comparison.json new file mode 100644 index 000000000000..569fc0b9fcd7 --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/comparison.json @@ -0,0 +1,15 @@ +{ + "out_dir": "/tmp/ssm_ablation_suite_v2", + "git_sha": "670724746c596f6c095970c4d50b82e6328423db", + "image": "nvllm:gb10-d2_7", + "n_runs": 5, + "gsm8k_floor": 45, + "patched_repo": "/tmp/nvllm-ssm-sentinel-patched", + "sentinels_root": "/tmp/nvllm-ablation-sentinels-v2", + "arms": [ + {"arm": "both", "ssm_sentinel": 
1, "kv_sentinel": 1, "gate_pass": "true", "correct_per_run": "48,48,48,48,48", "verdict": "/tmp/ssm_ablation_suite_v2/both/verdict.json"}, + {"arm": "neither", "ssm_sentinel": 0, "kv_sentinel": 0, "gate_pass": "true", "correct_per_run": "48,48,48,48,48", "verdict": "/tmp/ssm_ablation_suite_v2/neither/verdict.json"}, + {"arm": "ssm_only", "ssm_sentinel": 1, "kv_sentinel": 0, "gate_pass": "true", "correct_per_run": "48,48,48,48,48", "verdict": "/tmp/ssm_ablation_suite_v2/ssm_only/verdict.json"}, + {"arm": "kv_only", "ssm_sentinel": 0, "kv_sentinel": 1, "gate_pass": "true", "correct_per_run": "47,47,47,47,47", "verdict": "/tmp/ssm_ablation_suite_v2/kv_only/verdict.json"} + ] +} diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/runner_manifest.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/runner_manifest.json new file mode 100644 index 000000000000..29ed9dc97670 --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/runner_manifest.json @@ -0,0 +1,20 @@ +{ + "runner": "/tmp/run_ablation_suite_v2.sh", + "started_utc": "2026-05-15T18:17:46Z", + "git_sha": "670724746c596f6c095970c4d50b82e6328423db", + "image": "nvllm:gb10-d2_7", + "image_id": "nvllm:gb10-d2_7@4df53234ad5c", + "image_digest": "no-digest", + "patched_repo": "/tmp/nvllm-ssm-sentinel-patched", + "sentinels_root": "/tmp/nvllm-ablation-sentinels-v2", + "host_name": "navi-ai", + "host_driver": "590.48.01", + "host_kernel": "6.17.0-1014-nvidia", + "gsm8k_n": 50, + "gsm8k_seed": 42, + "gsm8k_max_tokens": 512, + "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf", + "hf_model": "ig1/Qwen3.5-27B-NVFP4", + "n_runs": 5, + "arms": ["both", "neither", "ssm_only", "kv_only"] +} diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/summary.md b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/summary.md new file mode 100644 
index 000000000000..6df72ccc678f --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/summary.md @@ -0,0 +1,137 @@ +# SSM zero-on-realloc — 4-arm sentinel ablation (2026-05-15) + +## Status + +**Harness validation only. No perf claim. No nsys trace.** + +The β-coop sustained-load collapse this work was designed to discriminate +was not reproducing on the host on 2026-05-15. Per friend's reframing: +> "If all arms pass and counters prove toggles worked: result is 'patch +> not harmful under non-repro conditions; collapse not available for +> discrimination.'" + +That is the result here. The harness is committed for the next collapse +window. + +## What ran + +- **Suite:** 4 arms (both / neither / ssm_only / kv_only), 5×GSM8K-50 + runs per arm, fresh container per arm, sentinel files at + `/run/nvllm/*.enabled` bind-mounted `:ro` per arm. +- **Total:** 20 runs (1000 generations), ~5h 25min wall (14:17→19:41 + EDT). 0 errors, 0 OOMs, 0 container restarts. +- **Suite code:** `scripts/ablation/run_ssm_ablation_suite.sh`, + `scripts/ablation/ssm_ablation_compare.py`, overlay applied via + `scripts/ablation/prepare_sentinel_overlay.sh`. 
+ +## Host / image manifest + +| Field | Value | +|---|---| +| started_utc | 2026-05-15T18:17:46Z | +| git_sha | `670724746c596f6c095970c4d50b82e6328423db` (`plan/beta-coop-layer-sweep-wo8` head at suite time) | +| image | `nvllm:gb10-d2_7` | +| image_id | `nvllm:gb10-d2_7@4df53234ad5c` | +| image_digest | `no-digest` (local-built image, never pushed) | +| host_driver | `590.48.01` | +| host_kernel | `6.17.0-1014-nvidia` | +| hardware | NVIDIA DGX Spark (GB10, SM120, 128 GB unified) | +| hf_model | `ig1/Qwen3.5-27B-NVFP4` | +| served_name | `default` | +| gsm8k_n | 50 | +| gsm8k_seed | 42 | +| gsm8k_max_tokens | 512 | +| prompt_set_hash | `f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf` (sha256 of `n|seed|model|served-name`) | +| phase_e_layers | `3,7` | +| wo_split | 8 | + +## Per-arm verdict + +| Arm | SSM sentinel | KV sentinel | runs (correct/50) | first_fire (ssm,kv) | gate_pass | harness_pass | +|---|---|---|---|---|---|---| +| `both` | 1 | 1 | 48,48,48,48,48 | (1, 1) | true | true | +| `neither` | 0 | 0 | 48,48,48,48,48 | (0, 0) | true | true | +| `ssm_only` | 1 | 0 | 48,48,48,48,48 | (1, 0) | true | true | +| `kv_only` | 0 | 1 | 47,47,47,47,47 | (0, 1) | true | true | + +`harness_pass=true` for all four arms means: when SSM_sentinel=1 the SSM +gate fired (and not when SSM_sentinel=0); same for KV. The sentinel +machinery is proven to discriminate. The env-strip confound from a prior +env-gated attempt is eliminated. + +## What this shows and does not show + +**Shows:** +- Sentinel-file gating works through vLLM EngineCore (env-stripped) where + env-var gating did not. +- Under non-collapsing host state, the SSM zero-on-realloc patch is + correctness-neutral (`both` and `ssm_only` both 48/50, identical to + `neither` baseline) and perf-neutral (median decode within 0.03 tok/s + across all arms). +- The KV `new_block_ids` channel relax (kv_only arm) is **NOT + correctness-neutral**: a deterministic -1 question across all 5 runs. 
+ That is the basis for shipping the SSM patch alone in the production + commit and keeping the KV relax in the harness overlay only. + +**Does not show:** +- Whether the SSM patch fixes the β-coop sustained-load collapse: the + collapse did not reproduce under today's host state. +- Any performance win: median decode is flat across arms; no nsys trace + was captured. + +## Per-arm steady-state stats + +(See `ANALYSIS.md` Section "Aggregate per-arm steady-state stats".) + +| Arm | N | median dtok/s | p95 wall_s | mean completion_tokens | finish_reason | +|---|---|---|---|---|---| +| both | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 | +| neither | 250 | 9.19 | 34.03 | 169.5 | length=5, stop=245 | +| ssm_only | 250 | 9.20 | 34.01 | 169.5 | length=5, stop=245 | +| kv_only | 250 | 9.17 | 30.74 | 169.5 | length=5, stop=245 | + +Note: per-arm 50-question completion-token sums all land at 8477 tokens +(mean 169.54). Per-Q values do differ (e.g. `kv_only` Q2 = 116 tokens vs +217 for the other three arms — verified distinct via output sha256 and +output_len), but per-arm sums coincide. This is a chance numerical +balance, not a stat-collection bug. + +## Drained KV invariant + +All 20 runs drained KV cleanly: `vllm:kv_cache_usage_perc` returned to +≤0.3pp of baseline at the post-run snapshot, well inside the 5pp +tolerance. (See `ANALYSIS.md` for the full per-run table.) + +## How to reproduce + +```bash +# 1. Build a sentinel-overlaid scratch checkout (~5 sec). +scripts/ablation/prepare_sentinel_overlay.sh /tmp/nvllm-ssm-sentinel-patched + +# 2. Run the 4-arm sweep (~5 h with default 5 runs x 4 arms x ~15 min/run). +scripts/ablation/run_ssm_ablation_suite.sh + +# 3. Produce ANALYSIS.md from the per-arm verdicts. +.venv/bin/python scripts/ablation/ssm_ablation_compare.py /tmp/ssm_ablation_suite +``` + +Env overrides for the runner are documented in the script header (see +`scripts/ablation/run_ssm_ablation_suite.sh`). 
+ +## What is NOT committed + +- Per-arm `docker.log` (×4, ~50 MB each) +- Per-arm `serve.log` (×4) +- Per-run per-Q `perq.jsonl` (×20 = 1000 records, ~600 KB) +- Per-run `metrics_*.json` (×120 snapshots) +- The full mamba slot trace (~750 events × 4 arms) + +These artifacts live in the suite OUT_DIR +(`/tmp/ssm_ablation_suite_v2/` at run time) and can be regenerated by +re-running the harness against the committed scripts. + +## Related + +- `docs/research/2026-05-15-ssm-zero-on-realloc/README.md` — design + harness usage +- Production patch: commit `feat(worker): add MambaBlockZeroer sister zeroer for SSM zero-on-realloc` +- Harness commit: `test(ablation): sentinel-gated SSM zero-on-realloc ablation harness` diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-both.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-both.json new file mode 100644 index 000000000000..3bf8164430c1 --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-both.json @@ -0,0 +1,36 @@ +{ + "arm": "both", + "ssm_sentinel": 1, + "kv_sentinel": 1, + "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated", + "patched_repo": "/tmp/nvllm-ssm-sentinel-patched", + "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/both", + "sentinel_files_inside": "kv_zero_for_mamba_ids.enabled,zero_ssm_on_realloc.enabled", + "container_id": "f3febab6e399ffc388d45b35e2d0cf12b525d9af2b9034df67969c80b8e8953e", + "host_driver": "590.48.01", + "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf", + "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 1, "kv_first_fire": 1}, + "git_sha": "670724746c596f6c095970c4d50b82e6328423db", + "image": "nvllm:gb10-d2_7", + "image_id": "nvllm:gb10-d2_7@4df53234ad5c", + "phase_e_layers": "3,7", + "phase_e_fusion": 1, + "phase_e_path": "auto", + "wo_split": 8, + "n_runs": 5, + "gsm8k_floor": 45, + 
"mamba_slot_trace_lines": 753, + "ablation_events": {"sentinel_check": 2, "first_fire": 2, "fire_count": 9}, + "token_summary": {"n_questions": 250, "sum_completion_tokens": 42385, "sum_prompt_tokens": 18405, "median_wall_time_s": 16.80155, "median_decode_tok_s": 9.198}, + "runs": [ + {"run": 1, "correct": 48, "errors": 0, "pass": true}, + {"run": 2, "correct": 48, "errors": 0, "pass": true}, + {"run": 3, "correct": 48, "errors": 0, "pass": true}, + {"run": 4, "correct": 48, "errors": 0, "pass": true}, + {"run": 5, "correct": 48, "errors": 0, "pass": true} + ], + "container_alive_at_end": true, + "docker_log_corruption_hits": 0, + "gate_pass": true, + "harness_pass": true +} diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-kv_only.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-kv_only.json new file mode 100644 index 000000000000..962b11c78550 --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-kv_only.json @@ -0,0 +1,36 @@ +{ + "arm": "kv_only", + "ssm_sentinel": 0, + "kv_sentinel": 1, + "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated", + "patched_repo": "/tmp/nvllm-ssm-sentinel-patched", + "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/kv_only", + "sentinel_files_inside": "kv_zero_for_mamba_ids.enabled", + "container_id": "20d76a07f0aff2648ef278a290a3555915ad4914929f017ef98461476a7d0e6d", + "host_driver": "590.48.01", + "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf", + "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 0, "kv_first_fire": 1}, + "git_sha": "670724746c596f6c095970c4d50b82e6328423db", + "image": "nvllm:gb10-d2_7", + "image_id": "nvllm:gb10-d2_7@4df53234ad5c", + "phase_e_layers": "3,7", + "phase_e_fusion": 1, + "phase_e_path": "auto", + "wo_split": 8, + "n_runs": 5, + "gsm8k_floor": 45, + "mamba_slot_trace_lines": 753, + "ablation_events": 
{"sentinel_check": 2, "first_fire": 1, "fire_count": 7}, + "token_summary": {"n_questions": 250, "sum_completion_tokens": 42385, "sum_prompt_tokens": 18405, "median_wall_time_s": 16.3489, "median_decode_tok_s": 9.1695}, + "runs": [ + {"run": 1, "correct": 47, "errors": 0, "pass": true}, + {"run": 2, "correct": 47, "errors": 0, "pass": true}, + {"run": 3, "correct": 47, "errors": 0, "pass": true}, + {"run": 4, "correct": 47, "errors": 0, "pass": true}, + {"run": 5, "correct": 47, "errors": 0, "pass": true} + ], + "container_alive_at_end": true, + "docker_log_corruption_hits": 0, + "gate_pass": true, + "harness_pass": true +} diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-neither.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-neither.json new file mode 100644 index 000000000000..a190c2389e17 --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-neither.json @@ -0,0 +1,36 @@ +{ + "arm": "neither", + "ssm_sentinel": 0, + "kv_sentinel": 0, + "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated", + "patched_repo": "/tmp/nvllm-ssm-sentinel-patched", + "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/neither", + "sentinel_files_inside": "", + "container_id": "f966c41b02ef6a455464d93361104fc49ab4d5420b21ec159d5ca37018c03559", + "host_driver": "590.48.01", + "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf", + "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 0, "kv_first_fire": 0}, + "git_sha": "670724746c596f6c095970c4d50b82e6328423db", + "image": "nvllm:gb10-d2_7", + "image_id": "nvllm:gb10-d2_7@4df53234ad5c", + "phase_e_layers": "3,7", + "phase_e_fusion": 1, + "phase_e_path": "auto", + "wo_split": 8, + "n_runs": 5, + "gsm8k_floor": 45, + "mamba_slot_trace_lines": 753, + "ablation_events": {"sentinel_check": 2, "first_fire": 0, "fire_count": 0}, + "token_summary": 
{"n_questions": 250, "sum_completion_tokens": 42385, "sum_prompt_tokens": 18405, "median_wall_time_s": 16.81105, "median_decode_tok_s": 9.193}, + "runs": [ + {"run": 1, "correct": 48, "errors": 0, "pass": true}, + {"run": 2, "correct": 48, "errors": 0, "pass": true}, + {"run": 3, "correct": 48, "errors": 0, "pass": true}, + {"run": 4, "correct": 48, "errors": 0, "pass": true}, + {"run": 5, "correct": 48, "errors": 0, "pass": true} + ], + "container_alive_at_end": true, + "docker_log_corruption_hits": 0, + "gate_pass": true, + "harness_pass": true +} diff --git a/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-ssm_only.json b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-ssm_only.json new file mode 100644 index 000000000000..418754cb3888 --- /dev/null +++ b/benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/verdict-ssm_only.json @@ -0,0 +1,36 @@ +{ + "arm": "ssm_only", + "ssm_sentinel": 1, + "kv_sentinel": 0, + "hypothesis": "ssm_zero_on_realloc_ablation_sentinel_gated", + "patched_repo": "/tmp/nvllm-ssm-sentinel-patched", + "sentinel_dir": "/tmp/nvllm-ablation-sentinels-v2/ssm_only", + "sentinel_files_inside": "zero_ssm_on_realloc.enabled", + "container_id": "ee4e6c9016a4e817239b9f75ce2022a6ce7af6b3713474ffcb1d7f5b1544411f", + "host_driver": "590.48.01", + "prompt_set_hash": "f422bd91dd644cc1a8afce282e51732977e1e1e5c361e894287f8eed5792e2cf", + "harness_validation": {"pass": true, "reason": "ok", "ssm_first_fire": 1, "kv_first_fire": 0}, + "git_sha": "670724746c596f6c095970c4d50b82e6328423db", + "image": "nvllm:gb10-d2_7", + "image_id": "nvllm:gb10-d2_7@4df53234ad5c", + "phase_e_layers": "3,7", + "phase_e_fusion": 1, + "phase_e_path": "auto", + "wo_split": 8, + "n_runs": 5, + "gsm8k_floor": 45, + "mamba_slot_trace_lines": 753, + "ablation_events": {"sentinel_check": 2, "first_fire": 1, "fire_count": 2}, + "token_summary": {"n_questions": 250, "sum_completion_tokens": 42385, 
"sum_prompt_tokens": 18405, "median_wall_time_s": 16.8064, "median_decode_tok_s": 9.197}, + "runs": [ + {"run": 1, "correct": 48, "errors": 0, "pass": true}, + {"run": 2, "correct": 48, "errors": 0, "pass": true}, + {"run": 3, "correct": 48, "errors": 0, "pass": true}, + {"run": 4, "correct": 48, "errors": 0, "pass": true}, + {"run": 5, "correct": 48, "errors": 0, "pass": true} + ], + "container_alive_at_end": true, + "docker_log_corruption_hits": 0, + "gate_pass": true, + "harness_pass": true +} diff --git a/docs/research/2026-05-15-ssm-zero-on-realloc/README.md b/docs/research/2026-05-15-ssm-zero-on-realloc/README.md new file mode 100644 index 000000000000..7980c5fb984d --- /dev/null +++ b/docs/research/2026-05-15-ssm-zero-on-realloc/README.md @@ -0,0 +1,121 @@ +# SSM zero-on-realloc — design + sentinel ablation harness + +## What + +The production patch (commit `feat(worker): add MambaBlockZeroer sister +zeroer for SSM zero-on-realloc`) adds an SSM zero-on-realloc guard alongside +the existing full-attention KV zero-on-realloc path. + +`KVBlockZeroer.zero_block_ids` now also walks a sister `MambaBlockZeroer` on +the same block-ID list, zeroing recycled `conv_state` / `ssm_state` rows via +`torch.index_fill_` before the next prefill writes into them. + +## Why + +The existing `KVBlockZeroer` (upstream PR #35219) clears full-attn KV blocks +at request-free / block-realloc time but skips Mamba layers because the conv +/ ssm page sizes differ from the full-attn page size and cannot share the +Triton kernel's uniform `PAGE_SIZE_EL`. `MambaBlockZeroer` covers the +remaining state. + +This addresses one half of the suspect set from the Mamba SSM cache +lifecycle audit (memory:`project_mamba_ssm_lifecycle`): +> "what's accumulating in-process between runs that isn't in any cherry-pick" + +Hybrid-attention models (Qwen3.5-27B and similar) hold per-block mamba +state in tensors whose leading dim is `num_blocks`. 
When a block ID is +recycled to a new request, the old request's mamba state in that slot +would otherwise persist as initial state for the new prefill. + +## What this commit series does NOT claim + +- **No "fixes collapse" claim.** The β-coop sustained-load collapse was not + reproducing on the host at the time of this work (2026-05-15). The patch + is shipped because the lifecycle gap is real; the patch's effect under + the failing host state is unknown. +- **No perf claim.** No nsys trace was captured. The 4-arm sentinel + ablation (below) shows median decode_tok_s within 0.03 tok/s across all + arms (perf-neutral under non-collapse load), but that is not a perf win. + +## The sentinel ablation harness + +The harness in `scripts/ablation/` lets a future operator A/B the patch +under a future collapse window without having to rebuild the image. + +It applies a sentinel overlay (`scripts/ablation/ssm_sentinel_overlay.patch`) +to a scratch checkout of the repo, replacing the production +always-on firing path with a filesystem-sentinel gated version. Per-arm, +the runner bind-mounts a per-arm sentinel directory at `/run/nvllm` :ro; +the gate at module-import-time stats the sentinel file and caches the +result. + +### Why sentinel files, not env vars + +vLLM EngineCore spawns the worker subprocess with most env vars stripped +(memory:`feedback_vllm_enginecore_env_strip`); only `VLLM_TARGET_DEVICE` +and `VLLM_WORKER_MULTIPROC_METHOD` survive. A previous env-gated +ablation (`v1`) was a null A/B because the gate always read empty-string. +Sentinel files survive subprocess spawn because the file system is the +shared substrate. 
+ +### Sentinel paths + +| Path | Effect when present | +|---|---| +| `/run/nvllm/zero_ssm_on_realloc.enabled` | SSM zero-on-realloc fires | +| `/run/nvllm/kv_zero_for_mamba_ids.enabled` | KV `new_block_ids` channel relaxed for MambaSpec allocations | + +The KV channel relax is included in the overlay for completeness but is +NOT shipped in the production patch — the 2026-05-15 4-arm sweep showed it +introduces a deterministic -1 question on `kv_only` (47/50 × 5 vs 48/50 × +5 on `both`, `neither`, `ssm_only`). + +### Per-arm signature + +When the harness runs each arm, the docker log triad proves which gates +fired: + +| Event | Meaning | +|---|---| +| `nvllm.ablation.sentinel_check name=<name> path=<path> exists=<bool> enabled=<bool>` | One per gate per worker process, at first call | +| `nvllm.ablation.first_fire name=<name> n_block_ids=<n>` | One per gate the first time the patched branch fires | +| `nvllm.ablation.fire_count name=<name> count=<n>` | Every 100th fire | + +`verdict.json` per arm includes: +- `harness_validation.pass` — false if SSM_sentinel=1 but first_fire=0 + (or vice versa), per gate +- `harness_pass` — top-level boolean mirror + +### Reproducing the 2026-05-15 sweep + +```bash +# Build a sentinel-overlaid scratch checkout (~5 sec). +scripts/ablation/prepare_sentinel_overlay.sh /tmp/nvllm-ssm-sentinel-patched + +# Run the 4-arm sweep (~5 h with default 5 runs × 4 arms × ~15 min/run). +scripts/ablation/run_ssm_ablation_suite.sh + +# Produce ANALYSIS.md from the per-arm verdicts. +.venv/bin/python scripts/ablation/ssm_ablation_compare.py /tmp/ssm_ablation_suite +``` + +Env overrides for the runner are documented in the script header. + +## Evidence + +The 2026-05-15 evidence dir lives at: + +``` +benchmarks/nvllm/traces/ssm_zero_on_realloc/2026-05-15-sentinel-ablation/ +``` + +See its `summary.md` for the per-arm verdict table, host/image manifest, +and what the run did and did not prove. + +## Related memory + +- `project_beta_coop_sustained_collapse` — the closed bisection arc +- `project_mamba_ssm_lifecycle` — the lifecycle audit that scoped this fix +- `feedback_substrate_not_cherry_pick` — methodology lesson from D2.x +- `feedback_vllm_enginecore_env_strip` — why env vars aren't reliable +- `feedback_default_vs_base_path_coverage` — why we keep the harness diff --git a/scripts/ablation/prepare_sentinel_overlay.sh b/scripts/ablation/prepare_sentinel_overlay.sh new file mode 100755 index 000000000000..4e15b2fdc904 --- /dev/null +++ b/scripts/ablation/prepare_sentinel_overlay.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash +# Build a sentinel-overlaid scratch checkout for run_ssm_ablation_suite.sh. 
+# +# Usage: +# scripts/ablation/prepare_sentinel_overlay.sh [SCRATCH_DIR] +# +# Defaults SCRATCH_DIR to /tmp/nvllm-ssm-sentinel-patched. +# Clones the current repo HEAD into SCRATCH_DIR, applies the sentinel +# overlay patch, and verifies the marker strings landed. +# +# The runner expects $PATCHED_REPO to point at SCRATCH_DIR. + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +REPO_ROOT="$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel)" +SCRATCH_DIR="${1:-/tmp/nvllm-ssm-sentinel-patched}" +OVERLAY="$SCRIPT_DIR/ssm_sentinel_overlay.patch" + +if [ ! -f "$OVERLAY" ]; then + echo "ERROR: overlay patch missing: $OVERLAY" >&2 + exit 1 +fi + +if [ -e "$SCRATCH_DIR" ]; then + echo "INFO: removing existing $SCRATCH_DIR" + rm -rf "$SCRATCH_DIR" +fi + +CURRENT_SHA="$(git -C "$REPO_ROOT" rev-parse HEAD)" +echo "cloning $REPO_ROOT @ $CURRENT_SHA -> $SCRATCH_DIR" +git clone --no-local "$REPO_ROOT" "$SCRATCH_DIR" >/dev/null +git -C "$SCRATCH_DIR" checkout --detach "$CURRENT_SHA" >/dev/null 2>&1 + +echo "applying $OVERLAY" +git -C "$SCRATCH_DIR" apply "$OVERLAY" + +# Verify markers. +SSM_HITS=$(grep -c _SSM_ZERO_SENTINEL "$SCRATCH_DIR/vllm/v1/worker/utils.py" || echo 0) +KV_HITS=$(grep -c _KV_ZERO_SENTINEL "$SCRATCH_DIR/vllm/v1/core/single_type_kv_cache_manager.py" || echo 0) +if [ "$SSM_HITS" -lt 1 ] || [ "$KV_HITS" -lt 1 ]; then + echo "ERROR: sentinel markers missing after overlay (SSM=$SSM_HITS KV=$KV_HITS)" >&2 + exit 1 +fi + +echo "done: PATCHED_REPO=$SCRATCH_DIR ready" +echo "next: PATCHED_REPO=$SCRATCH_DIR scripts/ablation/run_ssm_ablation_suite.sh" diff --git a/scripts/ablation/run_ssm_ablation_suite.sh b/scripts/ablation/run_ssm_ablation_suite.sh new file mode 100755 index 000000000000..8a072858e76a --- /dev/null +++ b/scripts/ablation/run_ssm_ablation_suite.sh @@ -0,0 +1,552 @@ +#!/usr/bin/env bash +# Sentinel-gated 4-arm SSM zero-on-realloc ablation suite. 
+# +# Per-arm bind-mounts a per-arm sentinel dir at /run/nvllm :ro into the +# container; the sentinel-gated overlay reads filesystem-existence as the +# toggle (env vars are stripped by vLLM EngineCore subprocess spawn — see +# memory:feedback_vllm_enginecore_env_strip). +# +# Sentinel files (presence == ENABLED, absence == DISABLED): +# /run/nvllm/zero_ssm_on_realloc.enabled +# /run/nvllm/kv_zero_for_mamba_ids.enabled +# +# Execution proof comes from a docker-log triad emitted by the sentinel +# overlay (apply scripts/ablation/ssm_sentinel_overlay.patch to a clean +# checkout to build $PATCHED_REPO): +# nvllm.ablation.sentinel_check name=<gate> exists=<bool> enabled=<bool> +# nvllm.ablation.first_fire name=<gate> n_block_ids=<n> +# nvllm.ablation.fire_count name=<gate> count=<n> +# +# Arm matrix: +# both - both sentinels present (full patch active) +# neither - no sentinels (baseline) +# ssm_only - SSM sentinel only (mamba zeroer only) +# kv_only - KV sentinel only (KV new-block-ids channel relax only) +# +# Usage: +# scripts/ablation/run_ssm_ablation_suite.sh # default 4 arms x 5 runs +# scripts/ablation/run_ssm_ablation_suite.sh --force # overwrite OUT_DIR +# +# Env overrides: +# OUT_DIR default /tmp/ssm_ablation_suite +# NVLLM_IMAGE default nvllm:gb10 +# REPO_ROOT default git toplevel of this script +# PATCHED_REPO default /tmp/nvllm-ssm-sentinel-patched +# (must contain the sentinel overlay applied to a clean +# checkout; see scripts/ablation/ssm_sentinel_overlay.patch) +# SENTINELS_ROOT default /tmp/nvllm-ablation-sentinels +# N_RUNS default 5 +# GSM8K_FLOOR default 45 +# CONTAINER default nvllm-ssm-ablation +# READY_TIMEOUT_S default 600 +# GSM8K_N default 50 +# GSM8K_SEED default 42 +# GSM8K_MAX_TOKENS default 512 +# GSM8K_TIMEOUT default 600 +# HF_MODEL default ig1/Qwen3.5-27B-NVFP4 +# SERVED_NAME default default + +# nounset is switched off while arguments and defaults are resolved; strict +# mode (set -euo pipefail) is only enabled after input validation. +set +u +FORCE=0 +# --force is the only accepted argument; anything else exits with status 64. +for arg in "$@"; do + case "$arg" in + --force) FORCE=1 ;; + *) echo "ERROR: unknown argument: $arg (only --force is accepted)" >&2; exit 64 ;; + esac +done + +# --------------------------------------------------------------------------- +# Defaults / inputs. Resolved BEFORE set -e per memory:feedback_bash_runner_patterns.
+# --------------------------------------------------------------------------- +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +OUT_DIR="${OUT_DIR:-/tmp/ssm_ablation_suite}" +NVLLM_IMAGE="${NVLLM_IMAGE:-nvllm:gb10}" +REPO_ROOT="${REPO_ROOT:-$(git -C "$SCRIPT_DIR" rev-parse --show-toplevel 2>/dev/null || echo "")}" +PATCHED_REPO="${PATCHED_REPO:-/tmp/nvllm-ssm-sentinel-patched}" +SENTINELS_ROOT="${SENTINELS_ROOT:-/tmp/nvllm-ablation-sentinels}" +N_RUNS="${N_RUNS:-5}" +GSM8K_N="${GSM8K_N:-50}" +GSM8K_SEED="${GSM8K_SEED:-42}" +GSM8K_MAX_TOKENS="${GSM8K_MAX_TOKENS:-512}" +GSM8K_TIMEOUT="${GSM8K_TIMEOUT:-600}" +GSM8K_FLOOR="${GSM8K_FLOOR:-45}" +CONTAINER="${CONTAINER:-nvllm-ssm-ablation}" +READY_TIMEOUT_S="${READY_TIMEOUT_S:-600}" +API="http://localhost:8000/v1" +METRICS_URL="http://localhost:8000/metrics" +HF_MODEL="${HF_MODEL:-ig1/Qwen3.5-27B-NVFP4}" +SERVED_NAME="${SERVED_NAME:-default}" + +# --------------------------------------------------------------------------- +# Validate inputs BEFORE set -e. +# --------------------------------------------------------------------------- +if [ -z "$REPO_ROOT" ] || ! git -C "$REPO_ROOT" rev-parse --git-dir >/dev/null 2>&1; then + echo "ERROR: REPO_ROOT='$REPO_ROOT' is not a git working tree" >&2; exit 1 +fi +if ! docker image inspect "$NVLLM_IMAGE" >/dev/null 2>&1; then + echo "ERROR: docker image '$NVLLM_IMAGE' not found" >&2; exit 1 +fi +for f in vllm/v1/worker/utils.py vllm/v1/worker/gpu_model_runner.py vllm/v1/core/single_type_kv_cache_manager.py; do + if [ ! -f "$PATCHED_REPO/$f" ]; then + echo "ERROR: patched file missing: $PATCHED_REPO/$f" >&2 + echo " Did you apply scripts/ablation/ssm_sentinel_overlay.patch to a clean checkout at PATCHED_REPO?" >&2 + exit 1 + fi +done +GSM8K_SCRIPT="$REPO_ROOT/scripts/gsm8k_eval_50.py" +if [ ! -f "$GSM8K_SCRIPT" ]; then + echo "ERROR: gsm8k_eval_50.py missing at $GSM8K_SCRIPT" >&2; exit 1 +fi +if [ ! 
-x "$REPO_ROOT/.venv/bin/python" ]; then + echo "ERROR: $REPO_ROOT/.venv/bin/python not found" >&2; exit 1 +fi +if ! grep -q -- '--run-index' "$GSM8K_SCRIPT" || ! grep -q -- '--metrics-url' "$GSM8K_SCRIPT"; then + echo "ERROR: $GSM8K_SCRIPT missing --run-index / --metrics-url (not instrumented)" >&2; exit 1 +fi +# Smoke-test that the patched files actually contain the sentinel markers. +if ! grep -q "_SSM_ZERO_SENTINEL" "$PATCHED_REPO/vllm/v1/worker/utils.py"; then + echo "ERROR: $PATCHED_REPO/vllm/v1/worker/utils.py missing _SSM_ZERO_SENTINEL marker (overlay not applied?)" >&2; exit 1 +fi +if ! grep -q "_KV_ZERO_SENTINEL" "$PATCHED_REPO/vllm/v1/core/single_type_kv_cache_manager.py"; then + echo "ERROR: $PATCHED_REPO/vllm/v1/core/single_type_kv_cache_manager.py missing _KV_ZERO_SENTINEL marker (overlay not applied?)" >&2; exit 1 +fi + +# Refuse stale OUT_DIR unless --force. +if [ -d "$OUT_DIR" ] && [ "$(ls -A "$OUT_DIR" 2>/dev/null)" ] && [ "$FORCE" -ne 1 ]; then + echo "ERROR: $OUT_DIR is non-empty (rerun with --force to overwrite)" >&2; exit 1 +fi +rm -rf "$OUT_DIR" +mkdir -p "$OUT_DIR" + +set -euo pipefail +log() { printf '[%(%Y-%m-%d %H:%M:%S)T] %s\n' -1 "$*"; } + +# Counter helper: awk-based count of lines matching a literal substring. +# Replaces `grep -c PATTERN file || echo 0`, which emitted "0\n0" when grep +# found zero matches (grep prints "0" + exits 1, triggering the || fallback). +count_substr() { + local pattern="$1" + local file="$2" + if [ ! -f "$file" ]; then + printf '0' + return + fi + awk -v pat="$pattern" 'index($0, pat) { n++ } END { print n+0 }' "$file" +} + +# --------------------------------------------------------------------------- +# Per-arm sentinel directories. SENTINELS_ROOT is rebuilt every run so we +# can be sure no stray sentinel from a prior arm leaks in. 
+# --------------------------------------------------------------------------- +rm -rf "$SENTINELS_ROOT" +mkdir -p "$SENTINELS_ROOT"/{both,neither,ssm_only,kv_only} +touch "$SENTINELS_ROOT/both/zero_ssm_on_realloc.enabled" +touch "$SENTINELS_ROOT/both/kv_zero_for_mamba_ids.enabled" +touch "$SENTINELS_ROOT/ssm_only/zero_ssm_on_realloc.enabled" +touch "$SENTINELS_ROOT/kv_only/kv_zero_for_mamba_ids.enabled" +# 'neither/' stays empty by design. + +log "sentinel dirs prepared:" +for arm in both neither ssm_only kv_only; do + files=$(ls "$SENTINELS_ROOT/$arm" 2>/dev/null | tr '\n' ',' | sed 's/,$//') + log " $SENTINELS_ROOT/$arm = [$files]" +done + +# Common bind-mounts (patch files are pre-built in $PATCHED_REPO; no apply step). +PATCHED_FILES=( + "vllm/v1/core/single_type_kv_cache_manager.py" + "vllm/v1/worker/utils.py" + "vllm/v1/worker/gpu_model_runner.py" +) +BIND_MOUNTS=() +for f in "${PATCHED_FILES[@]}"; do + BIND_MOUNTS+=(-v "$PATCHED_REPO/$f:/app/nvllm/$f") +done + +GIT_SHA="$(git -C "$REPO_ROOT" rev-parse HEAD)" +IMAGE_ID="$(docker images --format '{{.Repository}}:{{.Tag}}@{{.ID}}' "$NVLLM_IMAGE" | head -n1)" +IMAGE_DIGEST="$(docker inspect --format '{{index .RepoDigests 0}}' "$NVLLM_IMAGE" 2>/dev/null || true)" +IMAGE_DIGEST="${IMAGE_DIGEST:-no-digest}" +HOST_DRIVER="$(nvidia-smi --query-gpu=driver_version --format=csv,noheader 2>/dev/null | head -n1 || echo "unknown")" +HOST_KERNEL="$(uname -r 2>/dev/null || echo unknown)" +HOST_NAME="$(hostname 2>/dev/null || echo unknown)" +# Deterministic prompt-set identifier: (n, seed, model, served-name). +PROMPT_SET_HASH="$(printf '%s|%s|%s|%s' "$GSM8K_N" "$GSM8K_SEED" "$HF_MODEL" "$SERVED_NAME" | sha256sum | awk '{print $1}')" + +# One-time runner manifest written before any arm runs. 
+{ + echo "{" + echo " \"runner\": \"$0\"," + echo " \"started_utc\": \"$(date -u +%Y-%m-%dT%H:%M:%SZ)\"," + echo " \"git_sha\": \"$GIT_SHA\"," + echo " \"image\": \"$NVLLM_IMAGE\"," + echo " \"image_id\": \"$IMAGE_ID\"," + echo " \"image_digest\": \"$IMAGE_DIGEST\"," + echo " \"patched_repo\": \"$PATCHED_REPO\"," + echo " \"sentinels_root\": \"$SENTINELS_ROOT\"," + echo " \"host_name\": \"$HOST_NAME\"," + echo " \"host_driver\": \"$HOST_DRIVER\"," + echo " \"host_kernel\": \"$HOST_KERNEL\"," + echo " \"gsm8k_n\": $GSM8K_N," + echo " \"gsm8k_seed\": $GSM8K_SEED," + echo " \"gsm8k_max_tokens\": $GSM8K_MAX_TOKENS," + echo " \"prompt_set_hash\": \"$PROMPT_SET_HASH\"," + echo " \"hf_model\": \"$HF_MODEL\"," + echo " \"n_runs\": $N_RUNS," + echo " \"arms\": [\"both\", \"neither\", \"ssm_only\", \"kv_only\"]" + echo "}" +} > "$OUT_DIR/runner_manifest.json" +log "runner manifest: $OUT_DIR/runner_manifest.json" + +# --------------------------------------------------------------------------- +# Arm matrix. 
+# --------------------------------------------------------------------------- +ARM_NAMES=(both neither ssm_only kv_only) +declare -A ARM_SSM=( [both]=1 [neither]=0 [ssm_only]=1 [kv_only]=0 ) +declare -A ARM_KV=( [both]=1 [neither]=0 [ssm_only]=0 [kv_only]=1 ) +declare -a ARM_GATE_PASS +declare -a ARM_GIT_SUMMARY + +for arm_idx in "${!ARM_NAMES[@]}"; do + ARM="${ARM_NAMES[$arm_idx]}" + SSM_VAL="${ARM_SSM[$ARM]}" + KV_VAL="${ARM_KV[$ARM]}" + ARM_DIR="$OUT_DIR/$ARM" + ARM_SENTINEL_DIR="$SENTINELS_ROOT/$ARM" + mkdir -p "$ARM_DIR" + log "========================================================================" + log "ARM $((arm_idx + 1))/4: $ARM (SSM_sentinel=$SSM_VAL, KV_sentinel=$KV_VAL)" + log "========================================================================" + + arm_files=$(ls "$ARM_SENTINEL_DIR" 2>/dev/null | tr '\n' ',' | sed 's/,$//') + log "nvllm.ablation.arm=$ARM host_sentinels_dir=$ARM_SENTINEL_DIR container_sentinels_dir=/run/nvllm files=[$arm_files]" + + docker rm -f "$CONTAINER" >/dev/null 2>&1 || true + + HOST_TRACE_DIR="$ARM_DIR/trace" + mkdir -p "$HOST_TRACE_DIR" + CONT_TRACE_PATH="/tmp/ssm_zero_trace/mamba_slot_trace.jsonl" + SERVE_LOG="$ARM_DIR/serve.log" + + log "boot patched server (arm=$ARM, image=$NVLLM_IMAGE, container=$CONTAINER)" + # shellcheck disable=SC2086 + docker run -d \ + --name "$CONTAINER" \ + --gpus all \ + --ipc=host \ + --network host \ + --shm-size=8g \ + -v "$HOME/.cache/huggingface:/root/.cache/huggingface" \ + -v "$HOME/.cache/flashinfer:/root/.cache/flashinfer" \ + -v "$HOST_TRACE_DIR:/tmp/ssm_zero_trace" \ + -v "$ARM_SENTINEL_DIR:/run/nvllm:ro" \ + "${BIND_MOUNTS[@]}" \ + -e VLLM_NVFP4_GEMM_BACKEND=cutlass \ + -e VLLM_ALLOW_LONG_MAX_MODEL_LEN=1 \ + -e PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True \ + -e NVLLM_MAMBA_SLOT_TRACE="$CONT_TRACE_PATH" \ + -e CUTE_PHASE_E_FUSION=1 \ + -e CUTE_PHASE_E_PATH=auto \ + -e CUTE_PHASE_E_LAYERS="3,7" \ + -e CUTE_PHASE_E_FALLBACK_RAISE=1 \ + -e CUTE_WO_SPLIT=8 \ + "$NVLLM_IMAGE" \ + 
serve \ + --model "$HF_MODEL" \ + --served-model-name "$SERVED_NAME" \ + --host 0.0.0.0 --port 8000 \ + --gpu-memory-utilization 0.85 \ + > "$SERVE_LOG" 2>&1 + + # Clear stale .pyc from bind-mounted dirs (memory:feedback_docker_bindmount). + sleep 2 + docker exec "$CONTAINER" sh -c ' + find /app/nvllm/vllm/v1/core /app/nvllm/vllm/v1/worker \ + -maxdepth 3 -name "__pycache__" -type d \ + -exec rm -rf {} + 2>/dev/null || true + ' || true + + # Active readiness probe (memory:feedback_active_serve_readiness_probe). + # NOT_READY_STATUS / NOT_READY_REASON record WHY the arm never served, so a + # container that died during boot is reported as boot_fail instead of being + # overwritten by the generic timeout verdict in the not-ready branch below. + NOT_READY_STATUS="not_ready" + NOT_READY_REASON="server_never_ready" + # SECONDS counts from script start; keep a per-arm origin for the ready log. + boot_start=$SECONDS + deadline=$((SECONDS + READY_TIMEOUT_S)) + log "wait for ready on $API/models ..." + READY=0 + while [ "$SECONDS" -lt "$deadline" ]; do + if ! docker ps --filter "name=^/${CONTAINER}$" --format '{{.Names}}' | grep -qx "$CONTAINER"; then + # docker run -d only writes the container id to $SERVE_LOG; the server + # output is what gets captured into docker.log on the next line. + echo "ERROR: container died during boot (arm=$ARM); see $ARM_DIR/docker.log" >&2 + docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true + NOT_READY_STATUS="boot_fail" + NOT_READY_REASON="container_died_during_boot" + break + fi + if curl -fsS "$API/models" >/dev/null 2>&1; then + if curl -fsS "$API/completions" -H 'Content-Type: application/json' \ + -d '{"model":"'"$SERVED_NAME"'","prompt":"warmup","max_tokens":8,"temperature":0}' \ + >/dev/null 2>&1; then + log "ready (~$((SECONDS - boot_start))s)" + READY=1 + break + fi + fi + sleep 5 + done + if [ "$READY" -ne 1 ]; then + log "WARN: arm=$ARM not ready (status=$NOT_READY_STATUS reason=$NOT_READY_REASON timeout=${READY_TIMEOUT_S}s)" + docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true + docker rm -f "$CONTAINER" >/dev/null 2>&1 || true + { + echo "{" + echo " \"arm\": \"$ARM\"," + echo " \"ssm_sentinel\": $SSM_VAL," + echo " \"kv_sentinel\": $KV_VAL," + echo " \"git_sha\": \"$GIT_SHA\"," + echo " \"image\": \"$NVLLM_IMAGE\"," + echo " \"image_id\": \"$IMAGE_ID\"," + echo " \"n_runs\": $N_RUNS," + echo " \"gsm8k_floor\": $GSM8K_FLOOR," + echo " \"runs\": []," + echo " \"container_alive_at_end\": false," + echo " \"gate_pass\": false," + echo " \"reason\": \"$NOT_READY_REASON\"" + echo "}" + } > "$ARM_DIR/verdict.json" + 
ARM_GATE_PASS[$arm_idx]="$NOT_READY_STATUS" + ARM_GIT_SUMMARY[$arm_idx]="-" + continue + fi + + # Bind-mount proof: sentinel marker present in patched utils.py inside container. + # grep -c prints "0" AND exits 1 on zero matches, so a `|| printf '0'` + # fallback captured "0\n0"; the -lt test then errored and the failure branch + # was skipped. `|| true` keeps the single count; the :-0 default below covers + # the case where grep printed nothing (file missing inside the container). + INSIDE_MARKER=$(docker exec "$CONTAINER" sh -c "grep -c '_SSM_ZERO_SENTINEL' /app/nvllm/vllm/v1/worker/utils.py 2>/dev/null" || true) + INSIDE_MARKER=${INSIDE_MARKER:-0} + if [ "$INSIDE_MARKER" -lt 1 ]; then + echo "FAIL: bind-mount did not land inside container (arm=$ARM)" >&2 + docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true + docker rm -f "$CONTAINER" >/dev/null 2>&1 || true + ARM_GATE_PASS[$arm_idx]="bind_fail" + ARM_GIT_SUMMARY[$arm_idx]="-" + continue + fi + log "bind-mount verified inside container (marker count=$INSIDE_MARKER)" + + # Sentinel-dir proof: docker exec ls /run/nvllm + SENTINEL_LIST_INSIDE="$(docker exec "$CONTAINER" sh -c 'ls /run/nvllm 2>/dev/null | tr "\n" "," | sed "s/,$//"' || true)" + log "sentinels inside container /run/nvllm = [$SENTINEL_LIST_INSIDE]" + + CONTAINER_ID="$(docker inspect --format '{{.Id}}' "$CONTAINER" 2>/dev/null || echo unknown)" + log "container id: $CONTAINER_ID" + docker inspect "$CONTAINER" > "$ARM_DIR/docker_inspect.json" 2>/dev/null || true + + declare -a RUN_RESULTS=() + + for run_idx in $(seq 1 "$N_RUNS"); do + RUN_DIR="$ARM_DIR/run${run_idx}" + mkdir -p "$RUN_DIR" + log "==> arm=$ARM run ${run_idx}/${N_RUNS} GSM8K-${GSM8K_N}" + set +e + ( cd "$REPO_ROOT" && \ + .venv/bin/python "$GSM8K_SCRIPT" \ + --api "$API" --model "$SERVED_NAME" \ + --n "$GSM8K_N" --seed "$GSM8K_SEED" \ + --max-tokens "$GSM8K_MAX_TOKENS" --timeout "$GSM8K_TIMEOUT" \ + --label "ablation_${ARM}_run${run_idx}" \ + --run-index "$run_idx" \ + --metrics-url "$METRICS_URL" \ + --save "$RUN_DIR/gsm8k.json" ) 2>&1 | tee "$RUN_DIR/gsm8k.log" + RC_GSM="${PIPESTATUS[0]}" + set -e + if [ "$RC_GSM" -ne 0 ]; then + log "WARN: arm=$ARM run ${run_idx} returned rc=$RC_GSM" + fi + if [ ! 
-f "$RUN_DIR/gsm8k.json" ]; then + log "FAIL: arm=$ARM run ${run_idx} did not produce gsm8k.json" + RUN_RESULTS+=("$run_idx fail no-json") + continue + fi + CORRECT="$("$REPO_ROOT/.venv/bin/python" -c "import json; print(json.load(open('$RUN_DIR/gsm8k.json'))['correct'])")" + ERRORS="$("$REPO_ROOT/.venv/bin/python" -c "import json; print(json.load(open('$RUN_DIR/gsm8k.json'))['errors'])")" + RUN_RESULTS+=("$run_idx $CORRECT $ERRORS") + log "<== arm=$ARM run ${run_idx} correct=$CORRECT errors=$ERRORS" + done + + # Final capture + teardown for this arm. + docker logs "$CONTAINER" > "$ARM_DIR/docker.log" 2>&1 || true + CONTAINER_ALIVE="false" + if docker ps --filter "name=^/${CONTAINER}$" --format '{{.Names}}' | grep -qx "$CONTAINER"; then + CONTAINER_ALIVE="true" + fi + + # Extract ablation event triad from docker logs - this is the execution proof. + ABLATION_EVENTS="$ARM_DIR/ablation_events.log" + grep -E "nvllm.ablation" "$ARM_DIR/docker.log" > "$ABLATION_EVENTS" || true + SENTINEL_CHECK_LINES=$(count_substr "nvllm.ablation.sentinel_check" "$ABLATION_EVENTS") + FIRST_FIRE_LINES=$(count_substr "nvllm.ablation.first_fire" "$ABLATION_EVENTS") + FIRE_COUNT_LINES=$(count_substr "nvllm.ablation.fire_count" "$ABLATION_EVENTS") + # Per-gate breakdown: did SSM gate fire? did KV gate fire? + SSM_FIRST_FIRE=$(count_substr "nvllm.ablation.first_fire name=ssm_zero_on_realloc" "$ABLATION_EVENTS") + KV_FIRST_FIRE=$(count_substr "nvllm.ablation.first_fire name=kv_zero_for_mamba_ids" "$ABLATION_EVENTS") + log "ablation events for $ARM: sentinel_check=$SENTINEL_CHECK_LINES first_fire=$FIRST_FIRE_LINES fire_count=$FIRE_COUNT_LINES (ssm_fire=$SSM_FIRST_FIRE kv_fire=$KV_FIRST_FIRE)" + + # Harness validation gate: enabled => first_fire>=1; disabled => first_fire==0. 
+ HARNESS_PASS="true" + HARNESS_REASON="ok" + if [ "$SSM_VAL" -eq 1 ] && [ "$SSM_FIRST_FIRE" -lt 1 ]; then + HARNESS_PASS="false"; HARNESS_REASON="ssm_enabled_but_no_first_fire" + fi + if [ "$SSM_VAL" -eq 0 ] && [ "$SSM_FIRST_FIRE" -gt 0 ]; then + HARNESS_PASS="false"; HARNESS_REASON="ssm_disabled_but_first_fire_observed" + fi + if [ "$KV_VAL" -eq 1 ] && [ "$KV_FIRST_FIRE" -lt 1 ]; then + HARNESS_PASS="false"; HARNESS_REASON="kv_enabled_but_no_first_fire" + fi + if [ "$KV_VAL" -eq 0 ] && [ "$KV_FIRST_FIRE" -gt 0 ]; then + HARNESS_PASS="false"; HARNESS_REASON="kv_disabled_but_first_fire_observed" + fi + log "harness validation for $ARM: pass=$HARNESS_PASS reason=$HARNESS_REASON" + + docker rm -f "$CONTAINER" >/dev/null 2>&1 || true + + TRACE_FILE="$HOST_TRACE_DIR/mamba_slot_trace.jsonl" + TRACE_LINES=0 + if [ -f "$TRACE_FILE" ]; then + TRACE_LINES=$(wc -l < "$TRACE_FILE" | tr -d ' ') + fi + + # Gate evaluation. + ALL_PASS="true" + for line in "${RUN_RESULTS[@]}"; do + read -r idx correct errors <<< "$line" + if [ "$correct" = "fail" ]; then ALL_PASS="false"; continue; fi + if [ "$correct" -lt "$GSM8K_FLOOR" ]; then ALL_PASS="false"; fi + if [ "$errors" -gt 0 ]; then ALL_PASS="false"; fi + done + + CORRUPT_HITS=0 + if [ -f "$ARM_DIR/docker.log" ]; then + CORRUPT_HITS=$(awk '/ERROR|FATAL|state.*corrupt/{n++} END{print n+0}' "$ARM_DIR/docker.log") + fi + [ "$CORRUPT_HITS" -gt 0 ] && ALL_PASS="false" + [ "$CONTAINER_ALIVE" != "true" ] && ALL_PASS="false" + + # Per-arm token summary. 
+ PERQ_FILE="$ARM_DIR/perq.jsonl" + rm -f "$PERQ_FILE" + for run_idx in $(seq 1 "$N_RUNS"); do + if [ -f "$ARM_DIR/run${run_idx}/perq.jsonl" ]; then + cat "$ARM_DIR/run${run_idx}/perq.jsonl" >> "$PERQ_FILE" + fi + done + TOKEN_SUMMARY="$($REPO_ROOT/.venv/bin/python - < 0: + walls.append(w) + if d > 0: + decode_rates.append(d) +except FileNotFoundError: + pass +print(json.dumps({ + "n_questions": count, + "sum_completion_tokens": sumc, + "sum_prompt_tokens": sump, + "median_wall_time_s": (statistics.median(walls) if walls else 0.0), + "median_decode_tok_s": (statistics.median(decode_rates) if decode_rates else 0.0), +})) +EOF +)" + + { + echo "{" + echo " \"arm\": \"$ARM\"," + echo " \"ssm_sentinel\": $SSM_VAL," + echo " \"kv_sentinel\": $KV_VAL," + echo " \"hypothesis\": \"ssm_zero_on_realloc_ablation_sentinel_gated\"," + echo " \"patched_repo\": \"$PATCHED_REPO\"," + echo " \"sentinel_dir\": \"$ARM_SENTINEL_DIR\"," + echo " \"sentinel_files_inside\": \"$SENTINEL_LIST_INSIDE\"," + echo " \"container_id\": \"$CONTAINER_ID\"," + echo " \"host_driver\": \"$HOST_DRIVER\"," + echo " \"prompt_set_hash\": \"$PROMPT_SET_HASH\"," + echo " \"harness_validation\": {\"pass\": $HARNESS_PASS, \"reason\": \"$HARNESS_REASON\", \"ssm_first_fire\": $SSM_FIRST_FIRE, \"kv_first_fire\": $KV_FIRST_FIRE}," + echo " \"git_sha\": \"$GIT_SHA\"," + echo " \"image\": \"$NVLLM_IMAGE\"," + echo " \"image_id\": \"$IMAGE_ID\"," + echo " \"phase_e_layers\": \"3,7\"," + echo " \"phase_e_fusion\": 1," + echo " \"phase_e_path\": \"auto\"," + echo " \"wo_split\": 8," + echo " \"n_runs\": $N_RUNS," + echo " \"gsm8k_floor\": $GSM8K_FLOOR," + echo " \"mamba_slot_trace_lines\": $TRACE_LINES," + echo " \"ablation_events\": {\"sentinel_check\": $SENTINEL_CHECK_LINES, \"first_fire\": $FIRST_FIRE_LINES, \"fire_count\": $FIRE_COUNT_LINES}," + echo " \"token_summary\": $TOKEN_SUMMARY," + echo " \"runs\": [" + first=1 + for line in "${RUN_RESULTS[@]}"; do + read -r idx correct errors <<< "$line" + [ "$first" -eq 
0 ] && echo "," || true + first=0 + if [ "$correct" = "fail" ]; then + echo -n " {\"run\": $idx, \"ok\": false, \"reason\": \"no_gsm8k_json\"}" + else + pass=true + [ "$correct" -lt "$GSM8K_FLOOR" ] && pass=false + [ "$errors" -gt 0 ] && pass=false + echo -n " {\"run\": $idx, \"correct\": $correct, \"errors\": $errors, \"pass\": $pass}" + fi + done + echo + echo " ]," + echo " \"container_alive_at_end\": $CONTAINER_ALIVE," + echo " \"docker_log_corruption_hits\": $CORRUPT_HITS," + echo " \"gate_pass\": $ALL_PASS," + echo " \"harness_pass\": $HARNESS_PASS" + echo "}" + } > "$ARM_DIR/verdict.json" + + ARM_GATE_PASS[$arm_idx]="$ALL_PASS" + SUM="" + for line in "${RUN_RESULTS[@]}"; do + read -r idx correct errors <<< "$line" + SUM+="${correct}," + done + ARM_GIT_SUMMARY[$arm_idx]="${SUM%,}" + + log "<== arm=$ARM complete (gate_pass=$ALL_PASS, trace_lines=$TRACE_LINES, sentinel_check=$SENTINEL_CHECK_LINES, first_fire=$FIRST_FIRE_LINES, runs=$SUM)" +done + +# --------------------------------------------------------------------------- +# Aggregate comparison.json +# --------------------------------------------------------------------------- +{ + echo "{" + echo " \"out_dir\": \"$OUT_DIR\"," + echo " \"git_sha\": \"$GIT_SHA\"," + echo " \"image\": \"$NVLLM_IMAGE\"," + echo " \"n_runs\": $N_RUNS," + echo " \"gsm8k_floor\": $GSM8K_FLOOR," + echo " \"patched_repo\": \"$PATCHED_REPO\"," + echo " \"sentinels_root\": \"$SENTINELS_ROOT\"," + echo " \"arms\": [" + first=1 + for arm_idx in "${!ARM_NAMES[@]}"; do + ARM="${ARM_NAMES[$arm_idx]}" + [ "$first" -eq 0 ] && echo "," || true + first=0 + GP="${ARM_GATE_PASS[$arm_idx]:-unknown}" + SUM="${ARM_GIT_SUMMARY[$arm_idx]:-unknown}" + echo -n " {\"arm\": \"$ARM\", \"ssm_sentinel\": ${ARM_SSM[$ARM]}, \"kv_sentinel\": ${ARM_KV[$ARM]}, \"gate_pass\": \"$GP\", \"correct_per_run\": \"$SUM\", \"verdict\": \"$OUT_DIR/$ARM/verdict.json\"}" + done + echo + echo " ]" + echo "}" +} > "$OUT_DIR/comparison.json" + +log "ablation suite complete" 
+log "comparison: $OUT_DIR/comparison.json" +exit 0 diff --git a/scripts/ablation/ssm_ablation_compare.py b/scripts/ablation/ssm_ablation_compare.py new file mode 100755 index 000000000000..3e6920e22f85 --- /dev/null +++ b/scripts/ablation/ssm_ablation_compare.py @@ -0,0 +1,423 @@ +""" +SSM zero-on-realloc ablation comparison tool. + +Reads each arm's verdict.json + per-question JSONL trace, emits a markdown +comparison at <OUT_DIR>/ANALYSIS.md with: + - Verdict table (arm x run x correct/errors/gate_pass) + - Q1-Q50 per-question table for Run 4 (collapse window) across all 4 arms: + latency_s, completion_tokens, decode_tok_s, finish_reason, correct + - Aggregate per-arm steady-state stats: median decode_tok_s, p95 latency, + mean completion_tokens + - Friend's interpretation thresholds applied: which arm matches "real + pipeline win" vs "shortened generations" + - Drained KV invariant check from /metrics pre vs post snapshot. + +Usage: + python3 scripts/ablation/ssm_ablation_compare.py [OUT_DIR] + +Default OUT_DIR: /tmp/ssm_ablation_suite + +Reads: + <OUT_DIR>/<arm>/verdict.json + <OUT_DIR>/<arm>/perq.jsonl (concatenated by runner) + <OUT_DIR>/<arm>/run<N>/perq.jsonl (per-run, used for Run-4 table) + <OUT_DIR>/<arm>/run<N>/metrics_*.json (pre / q10..q50 / post snapshots) + +Writes: + <OUT_DIR>/ANALYSIS.md +""" + +from __future__ import annotations + +import json +import os +import statistics +import sys +from typing import Any + +# Arm presentation order = same as the runner. +ARM_ORDER = ("both", "neither", "ssm_only", "kv_only") +RUN_INDICES = (1, 2, 3, 4, 5) +COLLAPSE_RUN = 4 # the friend's collapse-window pin +METRIC_KEY_KV_USAGE = "vllm:kv_cache_usage_perc" +METRIC_KEY_KV_USAGE_TOL = 0.05 # 5 percentage-point tolerance for "drained" + +# Friend's interpretation thresholds. +# - "real pipeline win" = decode_tok_s materially higher AND completion_tokens +# not shortened relative to neither/baseline.
+# - "shortened generations" = decode_tok_s higher BUT mean completion_tokens +# notably lower (typical max_tokens=512 with finish_reason=length disappearing). +TPOT_WIN_RATIO = 1.30 # >=30% decode_tok_s vs baseline = "win" +SHORTEN_RATIO = 0.85 # <=85% of baseline mean completion_tokens = "shortened" + + +def _load_json(path: str) -> Any: + try: + with open(path) as f: + return json.load(f) + except Exception: + return None + + +def _load_jsonl(path: str) -> list[dict]: + out: list[dict] = [] + try: + with open(path) as f: + for line in f: + line = line.strip() + if not line: + continue + try: + out.append(json.loads(line)) + except Exception: + continue + except FileNotFoundError: + pass + return out + + +def _p95(values: list[float]) -> float: + if not values: + return 0.0 + s = sorted(values) + idx = max(0, int(round(0.95 * (len(s) - 1)))) + return s[idx] + + +def _fmt(v: Any, prec: int = 2) -> str: + if v is None: + return "-" + if isinstance(v, float): + return f"{v:.{prec}f}" + return str(v) + + +def _arm_stats(perq_records: list[dict]) -> dict: + if not perq_records: + return { + "n_questions": 0, + "median_decode_tok_s": 0.0, + "p95_wall_time_s": 0.0, + "mean_completion_tokens": 0.0, + "finish_reason_counts": {}, + } + decode_rates = [ + float(r.get("decode_tok_s", 0) or 0) + for r in perq_records + if float(r.get("decode_tok_s", 0) or 0) > 0 + ] + walls = [ + float(r.get("wall_time_s", 0) or 0) + for r in perq_records + if float(r.get("wall_time_s", 0) or 0) > 0 + ] + comp_tokens = [int(r.get("completion_tokens", 0) or 0) for r in perq_records] + finish_reasons: dict[str, int] = {} + for r in perq_records: + fr = str(r.get("finish_reason")) + finish_reasons[fr] = finish_reasons.get(fr, 0) + 1 + return { + "n_questions": len(perq_records), + "median_decode_tok_s": (statistics.median(decode_rates) if decode_rates else 0.0), + "p95_wall_time_s": _p95(walls), + "mean_completion_tokens": (statistics.mean(comp_tokens) if comp_tokens else 0.0), + 
"finish_reason_counts": finish_reasons, + } + + +def _drained_invariant(metric_pre: dict, metric_post: dict) -> dict: + """Did KV usage % return to baseline at the post snapshot? + + Returns dict with pre, post, delta_pp, drained (bool). + """ + if not isinstance(metric_pre, dict) or not isinstance(metric_post, dict): + return {"pre": None, "post": None, "delta_pp": None, "drained": None} + m_pre = (metric_pre or {}).get("metrics", {}) or {} + m_post = (metric_post or {}).get("metrics", {}) or {} + pre_val = m_pre.get(METRIC_KEY_KV_USAGE) + post_val = m_post.get(METRIC_KEY_KV_USAGE) + if pre_val is None or post_val is None: + return {"pre": pre_val, "post": post_val, "delta_pp": None, "drained": None} + try: + delta = float(post_val) - float(pre_val) + except (TypeError, ValueError): + return {"pre": pre_val, "post": post_val, "delta_pp": None, "drained": None} + return { + "pre": float(pre_val), + "post": float(post_val), + "delta_pp": delta, + "drained": abs(delta) <= METRIC_KEY_KV_USAGE_TOL, + } + + +def _load_arm(out_dir: str, arm: str) -> dict: + arm_dir = os.path.join(out_dir, arm) + verdict = _load_json(os.path.join(arm_dir, "verdict.json")) or {} + perq_concat_path = os.path.join(arm_dir, "perq.jsonl") + perq_concat = _load_jsonl(perq_concat_path) + + # Per-run breakouts. + runs: dict[int, dict] = {} + for run_idx in RUN_INDICES: + run_dir = os.path.join(arm_dir, f"run{run_idx}") + gsm = _load_json(os.path.join(run_dir, "gsm8k.json")) + perq = _load_jsonl(os.path.join(run_dir, "perq.jsonl")) + runs[run_idx] = {"gsm8k": gsm, "perq": perq, "dir": run_dir} + + # /metrics snapshots: typically Run 1 holds the pre/post pair. We aggregate + # from each run's own pre/post if they exist (the eval writes them next to + # the run's gsm8k.json). 
+ arm_metrics = {} + for run_idx in RUN_INDICES: + run_dir = runs[run_idx]["dir"] + arm_metrics[run_idx] = { + "pre": _load_json(os.path.join(run_dir, "metrics_pre.json")), + "post": _load_json(os.path.join(run_dir, "metrics_post.json")), + } + for tag in ("q10", "q20", "q30", "q40", "q50"): + snap = _load_json(os.path.join(run_dir, f"metrics_{tag}.json")) + if snap is not None: + arm_metrics[run_idx][tag] = snap + + return { + "verdict": verdict, + "perq_concat": perq_concat, + "runs": runs, + "metrics": arm_metrics, + } + + +def _interpretation(stats_by_arm: dict[str, dict]) -> dict[str, str]: + """Apply friend's win/shortened thresholds, with 'neither' as baseline.""" + out: dict[str, str] = {} + baseline = stats_by_arm.get("neither", {}) + base_decode = float(baseline.get("median_decode_tok_s") or 0.0) or None + base_compt = float(baseline.get("mean_completion_tokens") or 0.0) or None + + for arm in ARM_ORDER: + s = stats_by_arm.get(arm, {}) + decode = float(s.get("median_decode_tok_s") or 0.0) + compt = float(s.get("mean_completion_tokens") or 0.0) + if base_decode is None or base_compt is None or arm == "neither": + verdict = "baseline" if arm == "neither" else "no baseline available" + out[arm] = verdict + continue + decode_ratio = decode / base_decode if base_decode > 0 else 0.0 + compt_ratio = compt / base_compt if base_compt > 0 else 0.0 + if decode_ratio >= TPOT_WIN_RATIO and compt_ratio >= SHORTEN_RATIO: + verdict = ( + f"REAL pipeline win " + f"(decode {decode_ratio:.2f}x baseline, compt {compt_ratio:.2f}x)" + ) + elif decode_ratio >= TPOT_WIN_RATIO and compt_ratio < SHORTEN_RATIO: + verdict = ( + f"SHORTENED generations - speed inflated " + f"(decode {decode_ratio:.2f}x, compt only {compt_ratio:.2f}x)" + ) + elif decode_ratio < TPOT_WIN_RATIO and compt_ratio >= SHORTEN_RATIO: + verdict = ( + f"no decode win vs baseline " + f"(decode {decode_ratio:.2f}x, compt {compt_ratio:.2f}x)" + ) + else: + verdict = ( + f"no win + shortened " + f"(decode 
{decode_ratio:.2f}x, compt {compt_ratio:.2f}x)" + ) + out[arm] = verdict + return out + + +def _render_verdict_table(arms_data: dict[str, dict]) -> str: + """Verdict table: arm x run x correct/errors/gate_pass.""" + lines: list[str] = [] + lines.append("| Arm | SSM | KV | Run 1 | Run 2 | Run 3 | Run 4 | Run 5 | Gate |") + lines.append("|-----|-----|----|-------|-------|-------|-------|-------|------|") + for arm in ARM_ORDER: + d = arms_data.get(arm, {}) + v = d.get("verdict", {}) + ssm = v.get("ssm_zero_on_realloc", v.get("ssm_sentinel", "?")) + kv = v.get("kv_zero_for_mamba_ids", v.get("kv_sentinel", "?")) + gate = v.get("gate_pass", "?") + cells: list[str] = [] + runs_arr = v.get("runs") or [] + run_by_idx = {int(r.get("run", -1)): r for r in runs_arr if isinstance(r, dict)} + for run_idx in RUN_INDICES: + r = run_by_idx.get(run_idx) + if r is None: + cells.append("-") + continue + if "correct" not in r: + cells.append(f"FAIL({r.get('reason', '?')})") + continue + cells.append(f"{r['correct']}/{r.get('errors', 0)}err") + lines.append( + "| " + " | ".join([arm, str(ssm), str(kv), *cells, str(gate)]) + " |" + ) + return "\n".join(lines) + + +def _render_run_table(arms_data: dict[str, dict], run_idx: int) -> str: + """Per-question table for one run across all 4 arms. Columns: + Q | :lat | :ct | :dtok/s | :fr | :ok + """ + arm_perq: dict[str, list[dict]] = { + arm: arms_data.get(arm, {}).get("runs", {}).get(run_idx, {}).get("perq") or [] + for arm in ARM_ORDER + } + # Build index by prompt_index per arm. + indexed: dict[str, dict[int, dict]] = { + arm: {int(r.get("prompt_index", -1)): r for r in arm_perq[arm]} + for arm in ARM_ORDER + } + # Union of seen prompt indices, sorted. 
+ all_qs: set[int] = set() + for arm in ARM_ORDER: + all_qs.update(indexed[arm].keys()) + if not all_qs: + return "_(no per-Q records for run "f"{run_idx}_)" + + header_cells = ["Q"] + for arm in ARM_ORDER: + header_cells += [ + f"{arm}:lat", f"{arm}:ct", f"{arm}:dtok/s", f"{arm}:fr", f"{arm}:ok" + ] + lines: list[str] = [] + lines.append("| " + " | ".join(header_cells) + " |") + lines.append("|" + "|".join(["---"] * len(header_cells)) + "|") + for q in sorted(all_qs): + row = [str(q)] + for arm in ARM_ORDER: + r = indexed[arm].get(q) + if r is None: + row += ["-", "-", "-", "-", "-"] + continue + row += [ + _fmt(r.get("wall_time_s"), 2), + _fmt(r.get("completion_tokens"), 0), + _fmt(r.get("decode_tok_s"), 2), + _fmt(r.get("finish_reason")), + "Y" if r.get("correct") else "N", + ] + lines.append("| " + " | ".join(row) + " |") + return "\n".join(lines) + + +def _render_steady_state(stats_by_arm: dict[str, dict]) -> str: + """Aggregate per-arm steady-state stats.""" + lines: list[str] = [] + lines.append( + "| Arm | N | median dtok/s | p95 wall_s | mean completion_tokens | finish_reason hist |" + ) + lines.append("|-----|---|---------------|------------|------------------------|--------------------|") + for arm in ARM_ORDER: + s = stats_by_arm.get(arm, {}) + fr = s.get("finish_reason_counts", {}) or {} + fr_str = ", ".join(f"{k}={v}" for k, v in sorted(fr.items())) + lines.append( + f"| {arm} | {s.get('n_questions', 0)} | " + f"{_fmt(s.get('median_decode_tok_s'), 2)} | " + f"{_fmt(s.get('p95_wall_time_s'), 2)} | " + f"{_fmt(s.get('mean_completion_tokens'), 1)} | {fr_str} |" + ) + return "\n".join(lines) + + +def _render_drained_section(arms_data: dict[str, dict]) -> str: + lines: list[str] = [] + lines.append( + f"Tolerance: |delta| <= {METRIC_KEY_KV_USAGE_TOL:.2f} (5 pp) " + f"counts as drained." 
+ ) + lines.append("") + lines.append("| Arm | Run | KV pre | KV post | delta | drained |") + lines.append("|-----|-----|--------|---------|-------|---------|") + for arm in ARM_ORDER: + d = arms_data.get(arm, {}) + metrics = d.get("metrics", {}) + for run_idx in RUN_INDICES: + m = metrics.get(run_idx, {}) + di = _drained_invariant(m.get("pre"), m.get("post")) + drained = di.get("drained") + drained_s = "-" if drained is None else ("Y" if drained else "N") + lines.append( + f"| {arm} | {run_idx} | {_fmt(di.get('pre'), 4)} | " + f"{_fmt(di.get('post'), 4)} | {_fmt(di.get('delta_pp'), 4)} | " + f"{drained_s} |" + ) + return "\n".join(lines) + + +def main(): + out_dir = sys.argv[1] if len(sys.argv) > 1 else "/tmp/ssm_ablation_suite" + if not os.path.isdir(out_dir): + print(f"ERROR: OUT_DIR not found: {out_dir}", file=sys.stderr) + sys.exit(2) + + arms_data: dict[str, dict] = { + arm: _load_arm(out_dir, arm) for arm in ARM_ORDER + } + # Steady-state stats per arm computed on perq concat. + stats_by_arm = { + arm: _arm_stats(arms_data[arm].get("perq_concat") or []) + for arm in ARM_ORDER + } + interp = _interpretation(stats_by_arm) + + md: list[str] = [] + md.append("# SSM zero-on-realloc ablation: 4-arm comparison") + md.append("") + md.append(f"- OUT_DIR: `{out_dir}`") + comp = _load_json(os.path.join(out_dir, "comparison.json")) or {} + md.append(f"- git_sha: `{comp.get('git_sha', '?')}`") + md.append(f"- image: `{comp.get('image', '?')}`") + md.append(f"- N runs per arm: {comp.get('n_runs', '?')}") + md.append(f"- gsm8k_floor: {comp.get('gsm8k_floor', '?')}") + md.append("") + md.append("## Verdict table (run x correct/errors)") + md.append("") + md.append(_render_verdict_table(arms_data)) + md.append("") + md.append( + f"## Per-question table - Run {COLLAPSE_RUN} (collapse window)" + ) + md.append("") + md.append( + "Columns per arm: lat (wall_time_s), ct (completion_tokens), " + "dtok/s (decode_tok_s), fr (finish_reason), ok (correct)." 
+ ) + md.append("") + md.append(_render_run_table(arms_data, COLLAPSE_RUN)) + md.append("") + md.append("## Aggregate per-arm steady-state stats (concat across runs)") + md.append("") + md.append(_render_steady_state(stats_by_arm)) + md.append("") + md.append("## Friend's interpretation thresholds applied") + md.append("") + md.append( + f"- 'real pipeline win' iff median decode_tok_s >= " + f"{TPOT_WIN_RATIO:.2f}x baseline ('neither') AND mean completion_tokens " + f">= {SHORTEN_RATIO:.2f}x baseline" + ) + md.append( + f"- 'shortened generations' iff decode rate up but completion_tokens " + f"< {SHORTEN_RATIO:.2f}x baseline" + ) + md.append("") + for arm in ARM_ORDER: + md.append(f"- **{arm}**: {interp.get(arm, '-')}") + md.append("") + md.append("## Drained KV invariant (per-run pre vs post)") + md.append("") + md.append(_render_drained_section(arms_data)) + md.append("") + + out_md = os.path.join(out_dir, "ANALYSIS.md") + with open(out_md, "w") as f: + f.write("\n".join(md) + "\n") + print(out_md) + + +if __name__ == "__main__": + main() diff --git a/scripts/ablation/ssm_sentinel_overlay.patch b/scripts/ablation/ssm_sentinel_overlay.patch new file mode 100644 index 000000000000..005d17eceb02 --- /dev/null +++ b/scripts/ablation/ssm_sentinel_overlay.patch @@ -0,0 +1,248 @@ +diff --git a/vllm/v1/core/single_type_kv_cache_manager.py b/vllm/v1/core/single_type_kv_cache_manager.py +index fa5395685..c2f0663ac 100644 +--- a/vllm/v1/core/single_type_kv_cache_manager.py ++++ b/vllm/v1/core/single_type_kv_cache_manager.py +@@ -1,10 +1,14 @@ + # SPDX-License-Identifier: Apache-2.0 + # SPDX-FileCopyrightText: Copyright contributors to the vLLM project + import itertools ++import json ++import os ++import time + from abc import ABC, abstractmethod + from collections import defaultdict + from collections.abc import Sequence + ++from vllm.logger import init_logger + from vllm.utils.math_utils import cdiv + from vllm.v1.core.block_pool import BlockPool + from 
vllm.v1.core.kv_cache_utils import ( +@@ -24,6 +28,77 @@ from vllm.v1.kv_cache_interface import ( + ) + from vllm.v1.request import Request + ++logger = init_logger(__name__) ++ ++# --- nvllm KV-zero-for-mamba-ids ablation knob (sentinel-file gated) -------- ++# Env vars get stripped by vLLM EngineCore subprocess spawn ++# (memory:feedback_vllm_enginecore_env_strip), so the gate is a filesystem ++# sentinel bind-mounted :ro into the container. Path is hardcoded; do NOT ++# read from env. Default OFF (no sentinel -> unpatched behavior). ++# ++# Sentinel: /run/nvllm/kv_zero_for_mamba_ids.enabled ++# Effect: relax the new_block_ids gate so MambaSpec allocations also push ++# onto the new_block_ids channel (both in the base manager and in ++# MambaManager.allocate_new_blocks align branch). ++# (Companion SSM zero-on-realloc gate lives in worker/utils.py.) ++from pathlib import Path as _Path ++_KV_ZERO_SENTINEL = _Path("/run/nvllm/kv_zero_for_mamba_ids.enabled") ++_KV_ZERO_CACHED: bool | None = None ++_KV_ZERO_FIRE_COUNT = 0 ++ ++ ++def _kv_zero_for_mamba_ids_enabled() -> bool: ++ """Cache-after-first-check gate. 
Logs sentinel_check on first call.""" ++ global _KV_ZERO_CACHED ++ if _KV_ZERO_CACHED is None: ++ exists = _KV_ZERO_SENTINEL.exists() ++ _KV_ZERO_CACHED = exists ++ logger.info( ++ "nvllm.ablation.sentinel_check name=kv_zero_for_mamba_ids " ++ "path=%s exists=%s enabled=%s", ++ _KV_ZERO_SENTINEL, exists, exists, ++ ) ++ return _KV_ZERO_CACHED ++ ++ ++def _kv_zero_for_mamba_ids_fired(n_block_ids: int) -> None: ++ """Increment fire counter; log first_fire (count==1) and every 100th.""" ++ global _KV_ZERO_FIRE_COUNT ++ _KV_ZERO_FIRE_COUNT += 1 ++ if _KV_ZERO_FIRE_COUNT == 1: ++ logger.info( ++ "nvllm.ablation.first_fire name=kv_zero_for_mamba_ids " ++ "n_block_ids=%d", ++ n_block_ids, ++ ) ++ elif _KV_ZERO_FIRE_COUNT % 100 == 0: ++ logger.info( ++ "nvllm.ablation.fire_count name=kv_zero_for_mamba_ids count=%d", ++ _KV_ZERO_FIRE_COUNT, ++ ) ++ ++ ++# --- nvllm mamba slot tracer (env-gated JSONL diagnostic) ------------------- ++# NVLLM_MAMBA_SLOT_TRACE=/path/to.jsonl enables JSONL append at alloc_align + ++# free sites; used by the ablation harness to confirm lifecycle events. ++_MAMBA_TRACE_PATH = os.environ.get("NVLLM_MAMBA_SLOT_TRACE", "") ++_MAMBA_TRACE_FH = None ++ ++ ++def _mamba_trace(record: dict) -> None: ++ """Append one JSONL record to NVLLM_MAMBA_SLOT_TRACE (no-op if unset).""" ++ global _MAMBA_TRACE_FH ++ if not _MAMBA_TRACE_PATH: ++ return ++ try: ++ if _MAMBA_TRACE_FH is None: ++ _MAMBA_TRACE_FH = open(_MAMBA_TRACE_PATH, "a", buffering=1) ++ record.setdefault("ts", time.time()) ++ _MAMBA_TRACE_FH.write(json.dumps(record, separators=(",", ":")) + "\n") ++ except Exception: ++ # Tracing must never break serving. 
++ pass ++ + + class SingleTypeKVCacheManager(ABC): + """ +@@ -209,8 +284,15 @@ class SingleTypeKVCacheManager(ABC): + cdiv(num_total_computed_tokens, self.block_size) - len(req_blocks) + ) + req_blocks.extend(allocated_blocks) +- if type(self.kv_cache_spec) is FullAttentionSpec: ++ _is_full = type(self.kv_cache_spec) is FullAttentionSpec ++ _is_mamba_gated = ( ++ type(self.kv_cache_spec) is MambaSpec ++ and _kv_zero_for_mamba_ids_enabled() ++ ) ++ if _is_full or _is_mamba_gated: + self.new_block_ids.extend(b.block_id for b in allocated_blocks) ++ if _is_mamba_gated: ++ _kv_zero_for_mamba_ids_fired(len(allocated_blocks)) + + def allocate_new_blocks( + self, request_id: str, num_tokens: int, num_tokens_main_model: int +@@ -237,8 +319,15 @@ class SingleTypeKVCacheManager(ABC): + else: + new_blocks = self.block_pool.get_new_blocks(num_new_blocks) + req_blocks.extend(new_blocks) +- if type(self.kv_cache_spec) is FullAttentionSpec: ++ _is_full = type(self.kv_cache_spec) is FullAttentionSpec ++ _is_mamba_gated = ( ++ type(self.kv_cache_spec) is MambaSpec ++ and _kv_zero_for_mamba_ids_enabled() ++ ) ++ if _is_full or _is_mamba_gated: + self.new_block_ids.extend(b.block_id for b in new_blocks) ++ if _is_mamba_gated: ++ _kv_zero_for_mamba_ids_fired(len(new_blocks)) + return new_blocks + + def take_new_block_ids(self) -> list[int]: +@@ -1005,10 +1094,37 @@ class MambaManager(SingleTypeKVCacheManager): + assert num_new_blocks <= self.num_speculative_blocks + 1 + new_blocks = self.block_pool.get_new_blocks(num_new_blocks) + req_blocks.extend(new_blocks) ++ # Mirror SingleTypeKVCacheManager.allocate_new_blocks: push the ++ # freshly-allocated block IDs onto the new_block_ids channel ++ # so the worker-side zero-on-realloc pass clears recycled ++ # mamba conv_state / ssm_state slots before the next prefill ++ # writes into them. Skip null blocks. Sentinel-gated. 
++ if _kv_zero_for_mamba_ids_enabled(): ++ _new_ids = [b.block_id for b in new_blocks if not b.is_null] ++ self.new_block_ids.extend(_new_ids) ++ if _new_ids: ++ _kv_zero_for_mamba_ids_fired(len(_new_ids)) + self._allocated_block_reqs.add(request_id) ++ _mamba_trace({ ++ "op": "alloc_align", ++ "request_id": request_id, ++ "block_ids": [b.block_id for b in new_blocks], ++ }) + return req_blocks[prev_block_len:] + + def free(self, request_id: str) -> None: ++ # Snapshot block IDs for the trace BEFORE super().free() clears them. ++ if _MAMBA_TRACE_PATH: ++ freed_ids = [ ++ b.block_id ++ for b in self.req_to_blocks.get(request_id, ()) ++ if not b.is_null ++ ] ++ _mamba_trace({ ++ "op": "free", ++ "request_id": request_id, ++ "block_ids": freed_ids, ++ }) + if self.mamba_cache_mode == "align": + self._allocated_block_reqs.discard(request_id) + self.last_state_block_idx.pop(request_id, None) +diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py +index 0335d5e0e..270b112d7 100644 +--- a/vllm/v1/worker/utils.py ++++ b/vllm/v1/worker/utils.py +@@ -77,25 +77,49 @@ def _zero_kv_blocks_kernel( + tl.store(ptr + offset + cols, tl.zeros([BLOCK_SIZE], dtype=tl.int32)) + + +-# Ops-visibility counter for the SSM zero-on-realloc pass. Logged at first +-# fire (count==1) and every 100th fire thereafter. Module-global; survives +-# across requests within a worker process. EngineCore strips env vars on +-# subprocess spawn, so this is intentionally not env-gated. ++# --- nvllm SSM zero-on-realloc ablation knob (sentinel-file gated) ---------- ++# Env vars get stripped by vLLM EngineCore subprocess spawn ++# (memory:feedback_vllm_enginecore_env_strip), so the gate is a filesystem ++# sentinel bind-mounted :ro into the container. Path is hardcoded; do NOT ++# read from env. Default OFF (no sentinel -> unpatched behavior). 
++# ++# Sentinel: /run/nvllm/zero_ssm_on_realloc.enabled ++# Effect: KVBlockZeroer.zero_block_ids walks the sister MambaBlockZeroer ++# on the same block-ID list. With the sentinel ABSENT the call is ++# skipped entirely (gate returns False). ++from pathlib import Path as _Path ++_SSM_ZERO_SENTINEL = _Path("/run/nvllm/zero_ssm_on_realloc.enabled") ++_SSM_ZERO_CACHED: bool | None = None + _SSM_ZERO_FIRE_COUNT = 0 + + ++def _ssm_zero_on_realloc_enabled() -> bool: ++ """Cache-after-first-check gate. Logs sentinel_check on first call.""" ++ global _SSM_ZERO_CACHED ++ if _SSM_ZERO_CACHED is None: ++ exists = _SSM_ZERO_SENTINEL.exists() ++ _SSM_ZERO_CACHED = exists ++ logger.info( ++ "nvllm.ablation.sentinel_check name=ssm_zero_on_realloc " ++ "path=%s exists=%s enabled=%s", ++ _SSM_ZERO_SENTINEL, exists, exists, ++ ) ++ return _SSM_ZERO_CACHED ++ ++ + def _ssm_zero_fired(n_block_ids: int) -> None: +- """Increment fire counter and log at first fire + every 100th.""" ++ """Increment fire counter; log first_fire (count==1) and every 100th.""" + global _SSM_ZERO_FIRE_COUNT + _SSM_ZERO_FIRE_COUNT += 1 + if _SSM_ZERO_FIRE_COUNT == 1: + logger.info( +- "nvllm.ssm_zero_on_realloc.first_fire n_block_ids=%d", ++ "nvllm.ablation.first_fire name=ssm_zero_on_realloc " ++ "n_block_ids=%d", + n_block_ids, + ) + elif _SSM_ZERO_FIRE_COUNT % 100 == 0: + logger.info( +- "nvllm.ssm_zero_on_realloc.fire_count count=%d", ++ "nvllm.ablation.fire_count name=ssm_zero_on_realloc count=%d", + _SSM_ZERO_FIRE_COUNT, + ) + +@@ -293,9 +317,14 @@ class KVBlockZeroer: + """Zero the KV cache memory for the given block IDs.""" + # Mamba layers are zeroed via PyTorch index_fill_; the sister zeroer + # tolerates the full block-ID list because it filters per-tensor by +- # leading dim. Runs unconditionally so recycled conv_state / ssm_state +- # rows are cleared before the next prefill writes into them. +- if block_ids and self._mamba_zeroer is not None: ++ # leading dim. 
Sentinel-gated: /run/nvllm/zero_ssm_on_realloc.enabled ++ # present -> on. Without the sentinel the production path is skipped ++ # for ablation A/B vs unpatched baseline. ++ if ( ++ block_ids ++ and self._mamba_zeroer is not None ++ and _ssm_zero_on_realloc_enabled() ++ ): + self._mamba_zeroer.zero_block_ids(block_ids) + _ssm_zero_fired(len(block_ids)) + if not block_ids or self._meta is None: diff --git a/scripts/gsm8k_eval_50.py b/scripts/gsm8k_eval_50.py index 9f239a414488..b0eeb48dac83 100644 --- a/scripts/gsm8k_eval_50.py +++ b/scripts/gsm8k_eval_50.py @@ -2,19 +2,34 @@ GSM8K 50-question random eval against a vLLM server. Reads cached HF gsm8k test parquet (1319 questions), samples N with a fixed -seed (default 50, seed=42 — reproducible), sends each to /v1/completions at +seed (default 50, seed=42 - reproducible), sends each to /v1/completions at temperature=0, parses final numeric answer. Per memory:feedback_eval_completions: /v1/completions, NOT /v1/chat/completions. +Instrumented form (2026-05-15) for the SSM zero-on-realloc ablation suite: + - per-question JSONL trace at /perq.jsonl (one record per Q) + - --run-index flag, stamped into every per-Q record + - usage tokens (prompt/completion/total) + decode_tok/s + finish_reason + - output sha256 (16 hex), request id, character count + - --metrics-url flag: snapshots vllm:* prometheus metrics at pre/q10/q20/ + q30/q40/q50/post tags, saved to /metrics_.json + +Timing semantics PRESERVED: wall_time_s is the time from the +requests.post() start to response received, EXCLUDING metrics-snapshot time. 
+ Usage: .venv/bin/python scripts/gsm8k_eval_50.py \\ --api http://localhost:8000/v1 --model default \\ - --n 50 --save out.json --label some_run_name + --n 50 --save out.json --label some_run_name \\ + --run-index 1 \\ + --metrics-url http://localhost:8000/metrics """ import argparse +import hashlib import json +import os import re import sys import time @@ -33,9 +48,17 @@ "ee7b8da9e381df27b9e3f7758a159ab2bdaa4dbaa910546cbbc47e0cb44e4f59" ) +# Subset of /metrics lines we extract into the per-snapshot JSON. +METRICS_KEYS = ( + "vllm:num_requests_running", + "vllm:num_requests_waiting", + "vllm:kv_cache_usage_perc", + "vllm:generation_tokens_total", + "vllm:num_preemptions_total", +) + def _load_test_split(): - import os if os.path.exists(GSM8K_TEST_ARROW): with pa.memory_map(GSM8K_TEST_ARROW, "rb") as src: return ipc.open_stream(src).read_all().to_pylist() @@ -65,6 +88,60 @@ def normalize(s: str) -> str: return s.strip() +def _snapshot_metrics(metrics_url, tag, perq_dir): + """Fetch /metrics, extract METRICS_KEYS, write metrics_.json. + + Best-effort: failures never abort the eval; returns a dict on success + or None on failure. Called outside the wall_time_s timer. + """ + if not metrics_url or not perq_dir: + return None + try: + r = requests.get(metrics_url, timeout=10) + r.raise_for_status() + body = r.text + except Exception as e: + snap = {"tag": tag, "ts": time.time(), "error": repr(e)} + try: + with open(os.path.join(perq_dir, f"metrics_{tag}.json"), "w") as f: + json.dump(snap, f, indent=2) + except Exception: + pass + return snap + + # Prometheus text format: each line starts with the metric name + # (possibly with {labels}) and a value. We extract the LAST numeric + # value seen for each desired key (sum across labels for gauges or + # final total for counters; both behaviors are acceptable here since + # we mostly care about deltas between snapshots). 
+ values: dict = {} + for line in body.splitlines(): + if not line or line.startswith("#"): + continue + # Match "metric_name" or "metric_name{labels}" + for key in METRICS_KEYS: + if line.startswith(key + " ") or line.startswith(key + "{"): + # split on whitespace from the right: "{...} " + parts = line.rsplit(None, 1) + if len(parts) != 2: + continue + try: + v = float(parts[1]) + except ValueError: + continue + # Sum across label sets (gauges total across engines; counters + # already monotonic, so summing engine_idx labels is correct). + values[key] = values.get(key, 0.0) + v + break + snap = {"tag": tag, "ts": time.time(), "metrics": values} + try: + with open(os.path.join(perq_dir, f"metrics_{tag}.json"), "w") as f: + json.dump(snap, f, indent=2) + except Exception: + pass + return snap + + def main(): ap = argparse.ArgumentParser() ap.add_argument("--api", default="http://localhost:8000/v1") @@ -75,14 +152,39 @@ def main(): ap.add_argument("--timeout", type=int, default=180) ap.add_argument("--label", default="gsm8k_50") ap.add_argument("--save", default=None) + # Instrumented additions: + ap.add_argument( + "--run-index", type=int, default=0, + help="Soak run index, stamped into every per-Q JSONL record", + ) + ap.add_argument( + "--metrics-url", default=None, + help="If set, snapshot /metrics pre / q10..q50 / post into " + "/metrics_.json (timing NOT charged to wall).", + ) args = ap.parse_args() + # perq_dir = directory holding gsm8k.json (i.e. ) + perq_dir = None + perq_fh = None + if args.save: + perq_dir = os.path.dirname(os.path.abspath(args.save)) or "." + try: + os.makedirs(perq_dir, exist_ok=True) + perq_fh = open(os.path.join(perq_dir, "perq.jsonl"), "a", buffering=1) + except Exception as e: + sys.stderr.write(f"WARN: cannot open perq.jsonl: {e}\n") + perq_fh = None + table = _load_test_split() import random rng = random.Random(args.seed) sample = rng.sample(table, args.n) + # Pre-flight metrics snapshot (NOT charged to any question's wall time). 
+ _snapshot_metrics(args.metrics_url, "pre", perq_dir) + results = [] correct = 0 errors = 0 @@ -99,13 +201,25 @@ def main(): "temperature": 0.0, "stop": ["\nQ:", "\nQuestion:"], } + + # Per-question instrumentation defaults (filled in on success). + usage = {} + finish_reason = None + request_id = None + text = "" + ts = time.time() try: r = requests.post( f"{args.api}/completions", json=body, timeout=args.timeout ) r.raise_for_status() - text = r.json()["choices"][0]["text"] + payload = r.json() + choice0 = payload.get("choices", [{}])[0] + text = choice0.get("text", "") + finish_reason = choice0.get("finish_reason") + usage = payload.get("usage", {}) or {} + request_id = payload.get("id") pred = normalize(extract_predicted(text)) ok = (pred == gold) status = "OK" if ok else "WRONG" @@ -116,25 +230,91 @@ def main(): pred = "" status = "ERROR" errors += 1 - elapsed = time.time() - ts + ok = False + wall_time_s = time.time() - ts + + # Per-Q instrumentation record. + prompt_tokens = usage.get("prompt_tokens", 0) if isinstance(usage, dict) else 0 + completion_tokens = ( + usage.get("completion_tokens", 0) if isinstance(usage, dict) else 0 + ) + total_tokens = usage.get("total_tokens", 0) if isinstance(usage, dict) else 0 + decode_tok_s = ( + (completion_tokens / wall_time_s) + if (completion_tokens and wall_time_s > 0) else 0.0 + ) + output_len = len(text) if isinstance(text, str) else 0 + try: + output_sha256 = hashlib.sha256( + (text if isinstance(text, str) else "").encode("utf-8", "replace") + ).hexdigest()[:16] + except Exception: + output_sha256 = "" + + perq_rec = { + "label": args.label, + "run_index": args.run_index, + "prompt_index": i + 1, # 1-based, matches "[N/50]" log format + "wall_time_s": round(wall_time_s, 4), + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "decode_tok_s": round(decode_tok_s, 3), + "finish_reason": finish_reason, + "gold": gold, + "pred": pred, + "correct": bool(ok), + 
"output_len": output_len, + "output_sha256": output_sha256, + "request_id": request_id, + "ts": ts, + } + if perq_fh is not None: + try: + perq_fh.write(json.dumps(perq_rec, separators=(",", ":")) + "\n") + except Exception: + pass + # Aggregate JSON results (same shape as before; do not break callers). results.append({ "i": i, "expected": gold, "got": pred, "status": status, - "elapsed": round(elapsed, 1), + "elapsed": round(wall_time_s, 1), "raw_tail": text[-200:] if isinstance(text, str) else "", "question": q[:80] + "..." if len(q) > 80 else q, + # New non-breaking fields (additive, do not alter existing keys): + "wall_time_s": round(wall_time_s, 4), + "prompt_tokens": prompt_tokens, + "completion_tokens": completion_tokens, + "total_tokens": total_tokens, + "decode_tok_s": round(decode_tok_s, 3), + "finish_reason": finish_reason, + "output_len": output_len, + "output_sha256": output_sha256, + "request_id": request_id, }) - # progress on stderr + # progress on stderr (preserve existing format for log greppability) sys.stderr.write( - f"[{i + 1}/{args.n}] {status} (gold={gold} pred={pred}) {elapsed:.1f}s\n" + f"[{i + 1}/{args.n}] {status} (gold={gold} pred={pred}) " + f"{wall_time_s:.1f}s ct={completion_tokens} dtok/s={decode_tok_s:.2f} " + f"fr={finish_reason}\n" ) sys.stderr.flush() + # Mid-eval metrics snapshots (after Q10/20/30/40/50). Done AFTER + # wall_time_s is recorded, so snapshot cost is never charged to a + # question's decode latency. + if (i + 1) in (10, 20, 30, 40, 50): + _snapshot_metrics(args.metrics_url, f"q{i + 1}", perq_dir) + total_t = time.time() - t0 + + # Post-eval snapshot (NOT charged to wall). + _snapshot_metrics(args.metrics_url, "post", perq_dir) + out = { "label": args.label, "model": args.model, @@ -146,10 +326,17 @@ def main(): "accuracy": f"{correct}/{args.n} ({100*correct/args.n:.1f}%)", "total_seconds": round(total_t, 1), "results": results, + # Additive aggregate fields (won't break existing parsers). 
+ "run_index": args.run_index, } if args.save: with open(args.save, "w") as f: json.dump(out, f, indent=2) + if perq_fh is not None: + try: + perq_fh.close() + except Exception: + pass print(json.dumps({k: v for k, v in out.items() if k != "results"}, indent=2)) diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 928d66bccbba..61e247747395 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -1031,8 +1031,11 @@ def _init_kv_zero_meta(self) -> None: Called from gpu_worker.py outside the CuMem pool context. """ self._kv_block_zeroer = KVBlockZeroer(self.device, self.pin_memory) + # Materialize the attn-groups iterator so KVBlockZeroer can walk it + # twice (once for full-attn segments, once for Mamba state tensors). + attn_groups_list = list(self._kv_cache_spec_attn_group_iterator()) self._kv_block_zeroer.init_meta( - attn_groups_iter=self._kv_cache_spec_attn_group_iterator(), + attn_groups_iter=attn_groups_list, kernel_block_sizes=self._kernel_block_sizes, cache_dtype=self.cache_config.cache_dtype, runner_only_attn_layers=self.runner_only_attn_layers, diff --git a/vllm/v1/worker/utils.py b/vllm/v1/worker/utils.py index 83fc12cb5c3b..0335d5e0e4d4 100644 --- a/vllm/v1/worker/utils.py +++ b/vllm/v1/worker/utils.py @@ -77,6 +77,95 @@ def _zero_kv_blocks_kernel( tl.store(ptr + offset + cols, tl.zeros([BLOCK_SIZE], dtype=tl.int32)) +# Ops-visibility counter for the SSM zero-on-realloc pass. Logged at first +# fire (count==1) and every 100th fire thereafter. Module-global; survives +# across requests within a worker process. EngineCore strips env vars on +# subprocess spawn, so this is intentionally not env-gated. 
+_SSM_ZERO_FIRE_COUNT = 0 + + +def _ssm_zero_fired(n_block_ids: int) -> None: + """Increment fire counter and log at first fire + every 100th.""" + global _SSM_ZERO_FIRE_COUNT + _SSM_ZERO_FIRE_COUNT += 1 + if _SSM_ZERO_FIRE_COUNT == 1: + logger.info( + "nvllm.ssm_zero_on_realloc.first_fire n_block_ids=%d", + n_block_ids, + ) + elif _SSM_ZERO_FIRE_COUNT % 100 == 0: + logger.info( + "nvllm.ssm_zero_on_realloc.fire_count count=%d", + _SSM_ZERO_FIRE_COUNT, + ) + + +class MambaBlockZeroer: + """Zeroes Mamba conv_state / ssm_state rows for given block IDs. + + Mamba state tensors have a per-layer leading "block" dim equal to the + number of blocks; row ``[block_id]`` is one block of state. The full-attn + KVBlockZeroer Triton kernel assumes a uniform page size across all + registered segments, which does not hold once Mamba layers are mixed in + (conv vs ssm vs attn page sizes all differ). This zeroer instead uses + PyTorch index-assignment per registered state tensor: simple, idempotent, + and called only at request-free / block-realloc time (not in the hot + decode path). + """ + + def __init__(self, device: torch.device, pin_memory: bool): + self.device = device + self.pin_memory = pin_memory + self._tensors: list[torch.Tensor] = [] + + def init_meta( + self, + attn_groups_iter: Iterable["AttentionGroup"], + static_forward_context: dict[str, Any], + ) -> None: + seen: set[int] = set() + for group in attn_groups_iter: + spec = group.kv_cache_spec + if not isinstance(spec, MambaSpec): + continue + for layer_name in group.layer_names: + layer = static_forward_context.get(layer_name, None) + if layer is None: + continue + kv = getattr(layer, "kv_cache", None) + # Mamba layers bind kv_cache as a list/tuple of state tensors + # (conv_state, ssm_state, ...). Each tensor's leading dim is + # num_blocks. 
+ if not isinstance(kv, (list, tuple)): + continue + for state in kv: + if not isinstance(state, torch.Tensor): + continue + key = state.data_ptr() + if key in seen: + continue + seen.add(key) + self._tensors.append(state) + + def zero_block_ids(self, block_ids: list[int]) -> None: + if not block_ids or not self._tensors: + return + # Same-step block_ids are tiny (<= max_num_seqs), so a fresh tensor is + # cheaper than a pinned buffer. Hybrid configs use a single block-id + # space (uniform page-size padding), so IDs from any per-spec manager + # are valid for every per-spec tensor; we defensively filter to each + # tensor's leading dim to survive future config drift. + idx_cpu = torch.tensor(block_ids, dtype=torch.long) + for state in self._tensors: + n = state.shape[0] + mask = (idx_cpu >= 0) & (idx_cpu < n) + local_ids = idx_cpu[mask] if not mask.all() else idx_cpu + if local_ids.numel() == 0: + continue + idx_gpu = local_ids.to(device=self.device, non_blocking=True) + state.index_fill_(0, idx_gpu, 0) + + class KVBlockZeroer: """Manages efficient zeroing of KV cache blocks via a Triton kernel. @@ -92,6 +181,7 @@ def __init__(self, device: torch.device, pin_memory: bool): self._id_cap: int = 0 self._ids_pinned: torch.Tensor | None = None self._ids_gpu: torch.Tensor | None = None + self._mamba_zeroer: MambaBlockZeroer | None = None def init_meta( self, @@ -112,8 +202,21 @@ def init_meta( PAGE_SIZE_EL accounts for this ratio so that ``block_id * PAGE_SIZE_EL`` lands at the correct offset. - Only AttentionSpec layers are processed; Mamba layers are skipped. + Full-attention layers go through the Triton zeroing kernel below. + Mamba layers are handed off to MambaBlockZeroer for a per-tensor + PyTorch ``index_fill_`` because the conv / ssm page sizes differ from + the full-attn page size and cannot share PAGE_SIZE_EL. """ + # Materialize the iterator so it can be walked twice (full-attn here + + # mamba in MambaBlockZeroer.init_meta below). 
gpu_model_runner.py + # already passes a list; this is defensive against other callers. + if not isinstance(attn_groups_iter, (list, tuple)): + attn_groups_iter = list(attn_groups_iter) + + # Set up sister zeroer for Mamba layers BEFORE the full-attn walk. + self._mamba_zeroer = MambaBlockZeroer(self.device, self.pin_memory) + self._mamba_zeroer.init_meta(attn_groups_iter, static_forward_context) + seen_ptrs: set[int] = set() seg_addrs: list[int] = [] page_size_el: int | None = None @@ -188,6 +291,13 @@ def init_meta( def zero_block_ids(self, block_ids: list[int]) -> None: """Zero the KV cache memory for the given block IDs.""" + # Mamba layers are zeroed via PyTorch index_fill_; the sister zeroer + # tolerates the full block-ID list because it filters per-tensor by + # leading dim. Runs unconditionally so recycled conv_state / ssm_state + # rows are cleared before the next prefill writes into them. + if block_ids and self._mamba_zeroer is not None: + self._mamba_zeroer.zero_block_ids(block_ids) + _ssm_zero_fired(len(block_ids)) if not block_ids or self._meta is None: return seg_addrs, page_size_el, blk_size, n_segs = self._meta