OCWC22 · OCWC22 · Apr 17, 2026 · Copilot · Apr 17, 2026 · Copilot
diff --git a/.github/configs/isb1-mechanism-baseline.yaml b/.github/configs/isb1-mechanism-baseline.yaml
@@ -0,0 +1,59 @@
+# ISB1 mechanism_eval — baseline (no compression, no speculative decoding).
+#
+# These rows anchor the mechanism-axis Pareto frontier: every other mechanism
+# variant is compared back to a baseline row with the same model × hardware ×
+# context band. Baseline rows declare mechanism=baseline / mechanism_variant=none
+# so they pass the mechanism_compression_quality gate without requiring a
+# quality_eval_id. The gate only enforces a completed quality eval for
+# supported-tier rows whose mechanism is in the compression set
+# (kv_quantization, kv_compression, compressed_attention).
+#
+# All cells here are benchmark_certification_status=dataset_replay_verified.
+# No live-serving certification is claimed.
+
+dsr1-fp8-h100-isb1-mechanism-baseline-vllm:  # DeepSeek-R1 FP8 on H100 via vLLM; baseline reference row
+  image: vllm/vllm-openai:v0.11.0
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  framework: vllm
+  runner: h100
+  benchmark-type: isb1_replay
+  runtime-stack-id: standalone:vllm
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  canonical-model-id: deepseek_r1_0528
+  mechanism: baseline  # no compression and no speculative decoding
+  mechanism-variant: none  # plain string "none", not YAML null
+  replay-configs:
+    - export-file: datasets/isb1/exports/core/code_8k1k.json
+      request-mode: multi-turn
+      support-status: supported  # passes the gate without a quality eval: baseline is not a compression mechanism
+      search-space:
+        - max-concurrency: 2
+          max-sessions: 2
+          max-turns-per-session: 3
+          num-warmup-sessions: 0
+
+qwen3.5-fp8-b200-isb1-mechanism-baseline-sglang:  # Qwen3.5 FP8 on B200 via SGLang; baseline reference row
+  image: lmsysorg/sglang:v0.5.9-cu130
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  precision: fp8
+  framework: sglang
+  runner: b200
+  benchmark-type: isb1_replay
+  runtime-stack-id: standalone:sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  canonical-model-id: qwen3_5_397b_a17b
+  mechanism: baseline  # no compression and no speculative decoding
+  mechanism-variant: none  # plain string "none", not YAML null
+  max-model-len: 131072  # 128 * 1024 tokens; set alongside the extension_131k export below
+  replay-configs:
+    - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+      request-mode: multi-turn
+      support-status: reviewed_preview  # preview tier, unlike the DSR1 baseline row (supported)
+      search-space:
+        - max-concurrency: 2
+          max-sessions: 2
+          max-turns-per-session: 3
+          num-warmup-sessions: 0
diff --git a/.github/configs/isb1-mechanism-fp8-kv.yaml b/.github/configs/isb1-mechanism-fp8-kv.yaml
@@ -0,0 +1,76 @@
+# ISB1 mechanism_eval — FP8 KV quantization.
+#
+# Exercises the engine-native FP8 KV cache path (vLLM --kv-cache-dtype fp8,
+# SGLang --kv-cache-dtype fp8_e4m3). Cells here ship with:
+#   mechanism: kv_quantization
+#   mechanism_variant: fp8_e4m3
+#   compression_method: fp8_e4m3
+#   compression_scope: kv_cache
+#   quality_eval_id: ruler_v1        ← registered harness
+#   quality_eval_status: pending     ← must become "completed" before
+#                                      support_status can move to "supported"
+#
+# Gate rule enforced by utils/gate_isb1.py mechanism_compression_quality:
+#   support_status == "supported" AND mechanism in compression set
+#   ⇒ quality_eval_status == "completed"
+#
+# Until the referenced RULER run lands, these rows stay at
+# support_status=reviewed_preview so the hard gate passes. Moving a row to
+# "supported" before quality_eval_status is "completed" will fail the gate.
+
+dsr1-fp8-h100-isb1-mechanism-fp8-kv-vllm:  # DeepSeek-R1 FP8 on H100 via vLLM with an FP8 KV cache
+  image: vllm/vllm-openai:v0.11.0
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  precision: fp8
+  framework: vllm
+  runner: h100
+  benchmark-type: isb1_replay
+  runtime-stack-id: standalone:vllm
+  hardware-profile-id: nvidia:h100_sxm_80gb
+  canonical-model-id: deepseek_r1_0528
+  mechanism: kv_quantization  # in the compression set, so the quality-eval gate applies
+  mechanism-variant: fp8_e4m3
+  compression-method: fp8_e4m3
+  compression-scope: kv_cache
+  quality-eval-id: ruler_v1  # registered in datasets/isb1/registry/quality_eval_registry.json
+  quality-eval-status: pending  # must be "completed" before support-status can be "supported"
+  kv-cache-dtype: fp8  # vLLM spelling; NOTE(review): no max-model-len here, unlike the Qwen 131k rows - confirm
+  replay-configs:  # NOTE(review): DSR1 baseline replays core/code_8k1k.json, not a 131k export - confirm pairing
+    - export-file: datasets/isb1/exports/extension_131k/code_131k1k.json
+      request-mode: multi-turn
+      support-status: reviewed_preview  # kept below "supported" while the quality eval is pending
+      search-space:
+        - max-concurrency: 2
+          max-sessions: 2
+          max-turns-per-session: 3
+          num-warmup-sessions: 0
+
+qwen3.5-fp8-b200-isb1-mechanism-fp8-kv-sglang:  # Qwen3.5 FP8 on B200 via SGLang with an FP8 KV cache
+  image: lmsysorg/sglang:v0.5.9-cu130
+  model: Qwen/Qwen3.5-397B-A17B-FP8
+  model-prefix: qwen3.5
+  precision: fp8
+  framework: sglang
+  runner: b200
+  benchmark-type: isb1_replay
+  runtime-stack-id: standalone:sglang
+  hardware-profile-id: nvidia:b200_sxm_180gb
+  canonical-model-id: qwen3_5_397b_a17b
+  mechanism: kv_quantization  # in the compression set, so the quality-eval gate applies
+  mechanism-variant: fp8_e4m3
+  compression-method: fp8_e4m3
+  compression-scope: kv_cache
+  quality-eval-id: ruler_v1  # registered in datasets/isb1/registry/quality_eval_registry.json
+  quality-eval-status: pending  # must be "completed" before support-status can be "supported"
+  kv-cache-dtype: fp8_e4m3  # SGLang spelling of the FP8 KV cache dtype
+  max-model-len: 131072  # 128 * 1024 tokens; same value as the Qwen baseline row
+  replay-configs:
+    - export-file: datasets/isb1/exports/extension_131k/code_131k1k_qwen3.5.json
+      request-mode: multi-turn
+      support-status: reviewed_preview  # kept below "supported" while the quality eval is pending
+      search-space:
+        - max-concurrency: 2
+          max-sessions: 2
+          max-turns-per-session: 3
+          num-warmup-sessions: 0
diff --git a/.github/workflows/run-isb1-mechanism-eval.yml b/.github/workflows/run-isb1-mechanism-eval.yml
@@ -0,0 +1,139 @@
+name: Run ISB1 Mechanism Eval Sweep
+run-name: ISB1 Mechanism Eval - ${{ github.event.inputs.config-file || '.github/configs/isb1-mechanism-baseline.yaml' }}
+
+# Dispatches ISB1 replay rows with mechanism_eval metadata attached.
+# The config files declare mechanism/mechanism_variant/quality_eval_id etc.
+# at the top level; utils/matrix_logic/generate_sweep_configs.py plumbs them
+# through as env vars read by utils/process_result_isb1.py.
+#
+# gate_isb1.py runs its mechanism_compression_quality gate against the
+# aggregated result set: any supported-tier compression row without a
+# completed, registered quality eval fails the gate.
+
+on:
+  workflow_dispatch:
+    inputs:
+      config-file:
+        description: ISB1 mechanism_eval config file path
+        required: true
+        default: .github/configs/isb1-mechanism-baseline.yaml
+      runner-type:
+        description: Optional space-separated runner filters (e.g. h200 b200)
+        required: false
+        default: ''
+      runner-config:
+        description: Runner config YAML
+        required: false
+        default: .github/configs/runners.yaml
+      ref:
+        description: Git ref to checkout
+        required: false
+        default: ''
+
+jobs:
+  # Expands the config file into a JSON matrix. has-matrix is "false" when the
+  # generated matrix is empty (falsy), which lets the downstream jobs skip.
+  setup:
+    runs-on: ubuntu-latest
+    outputs:
+      mechanism-matrix: ${{ steps.generate.outputs.mechanism-matrix }}
+      has-matrix: ${{ steps.generate.outputs.has-matrix }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # v6.0.2
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+          ref: ${{ inputs.ref || github.ref }}
+
+      - name: Install dependencies
+        run: pip install pydantic pyyaml
+
+      # Fails when the config file is missing. RUNNER_TYPE is split on
+      # whitespace and passed as separate values after --runner-type.
+      - id: generate
+        env:
+          CONFIG_FILE: ${{ inputs.config-file }}
+          RUNNER_CONFIG: ${{ inputs.runner-config }}
+          RUNNER_TYPE: ${{ inputs.runner-type }}
+        run: |
+          if [ ! -f "$CONFIG_FILE" ]; then
+            echo "Missing ISB1 mechanism_eval config file: $CONFIG_FILE" >&2
+            exit 1
+          fi
+
+          cmd=(python3 utils/matrix_logic/generate_sweep_configs.py isb1-sweep --config-files "$CONFIG_FILE" --runner-config "$RUNNER_CONFIG")
+
+          if [ -n "$RUNNER_TYPE" ]; then
+            read -r -a runner_types <<< "$RUNNER_TYPE"
+            cmd+=(--runner-type "${runner_types[@]}")
+          fi
+
+          matrix_json="$("${cmd[@]}")"
+          compact_matrix="$(printf '%s' "$matrix_json" | python3 -c 'import json,sys; print(json.dumps(json.load(sys.stdin)))')"
+          has_matrix="$(printf '%s' "$compact_matrix" | python3 -c 'import json,sys; print("true" if json.load(sys.stdin) else "false")')"
+
+          {
+            echo "mechanism-matrix=$compact_matrix"
+            echo "has-matrix=$has_matrix"
+          } >> "$GITHUB_OUTPUT"
+
+  # Calls the ISB1 benchmark template once per matrix entry. fail-fast is off
+  # so a failing cell does not cancel the remaining cells.
+  sweep:
+    needs: setup
+    if: ${{ needs.setup.outputs.has-matrix == 'true' }}
+    uses: ./.github/workflows/benchmark-isb1-tmpl.yml
+    strategy:
+      fail-fast: false
+      matrix:
+        config: ${{ fromJson(needs.setup.outputs.mechanism-matrix) }}
+    secrets: inherit
+    with:
+      runner: ${{ matrix.config.runner }}
+      image: ${{ matrix.config.image }}
+      model: ${{ matrix.config.model }}
+      model-prefix: ${{ matrix.config.model-prefix }}
+      precision: ${{ matrix.config.precision }}
+      framework: ${{ matrix.config.framework }}
+      exp-name: ${{ matrix.config.exp-name }}
+      benchmark-type: ${{ matrix.config.benchmark-type }}
+      export-file: ${{ matrix.config.export-file }}
+      runtime-stack-id: ${{ matrix.config.runtime-stack-id }}
+      hardware-profile-id: ${{ matrix.config.hardware-profile-id }}
+      canonical-model-id: ${{ matrix.config.canonical-model-id }}
+      support-status: ${{ matrix.config.support-status || '' }}
+      request-mode: ${{ matrix.config.request-mode }}
+      max-concurrency: ${{ matrix.config.max-concurrency }}
+      max-sessions: ${{ matrix.config.max-sessions || '' }}
+      max-turns-per-session: ${{ matrix.config.max-turns-per-session || '' }}
+      max-model-len: ${{ matrix.config.max-model-len || '' }}
+      tp-override: ${{ matrix.config.tp || '' }}
+      ep-override: ${{ matrix.config.ep || '' }}
+      trace-source: ${{ matrix.config.trace-source || '' }}
+      offload-mode: ${{ matrix.config.offload-mode || '' }}
+      kv-cache-dtype: ${{ matrix.config.kv-cache-dtype || '' }}
+      disable-prefix-caching: ${{ matrix.config.disable-prefix-caching || '' }}
+      workload-type: ${{ matrix.config.workload-type || '' }}
+      # mechanism_eval metadata; each key is passed exactly once because
+      # duplicate mapping keys are invalid. NOTE(review): confirm that
+      # benchmark-isb1-tmpl.yml declares every input below before merging.
+      mechanism: ${{ matrix.config.mechanism || '' }}
+      mechanism-variant: ${{ matrix.config.mechanism-variant || '' }}
+      quality-eval-id: ${{ matrix.config.quality-eval-id || '' }}
+      quality-eval-status: ${{ matrix.config.quality-eval-status || '' }}
+      draft-model: ${{ matrix.config.draft-model || '' }}
+      draft-model-prefix: ${{ matrix.config.draft-model-prefix || '' }}
+      speculative-model: ${{ matrix.config.speculative-model || '' }}
+      speculative-model-prefix: ${{ matrix.config.speculative-model-prefix || '' }}
+      ref: ${{ inputs.ref || github.ref }}
+
+  # always() keeps collection running when sweep cells fail; it is skipped
+  # only when the matrix was empty or the sweep itself was skipped.
+  collect-results:
+    needs: [setup, sweep]
+    if: ${{ always() && needs.setup.outputs.has-matrix == 'true' && needs.sweep.result != 'skipped' }}
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      result-prefix: isb1-mechanism
diff --git a/datasets/isb1/registry/mechanism_variant_registry.json b/datasets/isb1/registry/mechanism_variant_registry.json
@@ -0,0 +1,77 @@
+{
+  "schema_version": "1.0.0",
+  "description": "Registered mechanism × variant pairs for ISB1 mechanism_eval. Rows whose mechanism/mechanism_variant do not appear here are flagged unregistered by utils/mechanism_eval.py and must not be cited as certified.",
+  "compression_mechanisms": [
+    "kv_quantization",
+    "kv_compression",
+    "compressed_attention"
+  ],
+  "speculative_mechanisms": [
+    "speculative_decoding"
+  ],
+  "variants": [
+    {
+      "mechanism": "baseline",
+      "mechanism_variant": "none",
+      "compression_method": null,
+      "compression_scope": null,
+      "description": "No mechanism applied. Used as the reference point for all other mechanism rows."
+    },
+    {
+      "mechanism": "kv_quantization",
+      "mechanism_variant": "fp8_e4m3",
+      "compression_method": "fp8_e4m3",
+      "compression_scope": "kv_cache",
+      "description": "Per-tensor FP8 E4M3 KV cache quantization. Engine-native path (vLLM --kv-cache-dtype fp8, SGLang --kv-cache-dtype fp8_e4m3)."
+    },
+    {
+      "mechanism": "kv_quantization",
+      "mechanism_variant": "turboquant_class",
+      "compression_method": "turboquant_class",
+      "compression_scope": "kv_cache",
+      "description": "TurboQuant-class Hadamard-rotated 4-bit KV quantization. Requires non-null quality_eval_id to be cited at supported tier."
+    },
+    {
+      "mechanism": "kv_compression",
+      "mechanism_variant": "kvtc_class",
+      "compression_method": "kvtc_class",
+      "compression_scope": "kv_cache",
+      "description": "KVTC-class tensor-codebook KV compression. Requires non-null quality_eval_id to be cited at supported tier."
+    },
+    {
+      "mechanism": "compressed_attention",
+      "mechanism_variant": "triattention_class",
+      "compression_method": "triattention_class",
+      "compression_scope": "attention",
+      "description": "TriAttention-class sparse-attention variant. Requires non-null quality_eval_id to be cited at supported tier."
+    },
+    {
+      "mechanism": "speculative_decoding",
+      "mechanism_variant": "mtp",
+      "compression_method": null,
+      "compression_scope": null,
+      "description": "Multi-token prediction head as draft model. Requires draft_model_id and speculative_acceptance_rate."
+    },
+    {
+      "mechanism": "speculative_decoding",
+      "mechanism_variant": "eagle3",
+      "compression_method": null,
+      "compression_scope": null,
+      "description": "EAGLE-3 speculative decoding. Requires draft_model_id and speculative_acceptance_rate."
+    },
+    {
+      "mechanism": "speculative_decoding",
+      "mechanism_variant": "medusa",
+      "compression_method": null,
+      "compression_scope": null,
+      "description": "Medusa speculative decoding. Requires draft_model_id and speculative_acceptance_rate."
+    },
+    {
+      "mechanism": "speculative_decoding",
+      "mechanism_variant": "dflash",
+      "compression_method": null,
+      "compression_scope": null,
+      "description": "DeepFlash-style draft stack. Requires draft_model_id and speculative_acceptance_rate."
+    }
+  ]
+}
diff --git a/datasets/isb1/registry/quality_eval_registry.json b/datasets/isb1/registry/quality_eval_registry.json
@@ -0,0 +1,42 @@
+{
+  "schema_version": "1.0.0",
+  "description": "Registered quality-eval harnesses for ISB1 mechanism_eval. A row asserting quality_eval_id must reference one of these; gate_isb1 requires a completed eval before any compression mechanism can claim support_status=supported.",
+  "eval_harnesses": [
+    {
+      "quality_eval_id": "ruler_v1",
+      "harness": "RULER",
+      "version": "v1.0",
+      "scope": "long_context_retrieval",
+      "metric_keys": ["ruler_avg_score", "ruler_per_length"],
+      "baseline_required": true,
+      "description": "Long-context retrieval benchmark. Primary signal for KV quantization and compression quality at 32k–1M."
+    },
+    {
+      "quality_eval_id": "longbench_v2",
+      "harness": "LongBench",
+      "version": "v2.0",
+      "scope": "long_context_reasoning",
+      "metric_keys": ["longbench_avg_f1", "longbench_per_task"],
+      "baseline_required": true,
+      "description": "Long-context reasoning and multi-doc QA. Complements RULER for reasoning-heavy workloads."
+    },
+    {
+      "quality_eval_id": "humaneval",
+      "harness": "HumanEval",
+      "version": "v1.0",
+      "scope": "code_generation",
+      "metric_keys": ["humaneval_pass_at_1", "humaneval_pass_at_10"],
+      "baseline_required": true,
+      "description": "Code-generation accuracy. Primary signal for coding workloads under compression."
+    },
+    {
+      "quality_eval_id": "math_500",
+      "harness": "MATH-500",
+      "version": "v1.0",
+      "scope": "reasoning_math",
+      "metric_keys": ["math_500_accuracy"],
+      "baseline_required": true,
+      "description": "Math reasoning accuracy. Detects chain-of-thought degradation from aggressive KV quantization."
+    }
+  ]
+}