Merged · 29 commits
433f2ef  add logic for event driven runs (cquil11, Dec 10, 2025)
dd4682b  testing pt 1 (cquil11, Dec 10, 2025)
7d6e052  raise error if yaml diff in perf changelog is not valid (cquil11, Dec 11, 2025)
ce49098  remove unused imports in process_changelog.py (cquil11, Dec 11, 2025)
e6f6fe9  config data key fix (cquil11, Dec 11, 2025)
b87eedd  raise error if test-config subprocess fails to run (cquil11, Dec 11, 2025)
ba0b115  backfill changelog (cquil11, Dec 11, 2025)
747bc2d  backfill changelog pt 2 (cquil11, Dec 11, 2025)
ca24b8e  backfill changelog pt 3 (cquil11, Dec 11, 2025)
954ebd6  backfill changelog pt 4 (cquil11, Dec 11, 2025)
ee346b3  backfill changelog pt 5 (cquil11, Dec 11, 2025)
ab6f948  backfill changelog pt 6 (cquil11, Dec 11, 2025)
27074d2  add always() condition to upload changelog metadata (cquil11, Dec 12, 2025)
763b394  backfill changelog pt 7 (test) (cquil11, Dec 12, 2025)
d0b2de7  backfill changelog pt 8 (revert test) (cquil11, Dec 12, 2025)
41341ad  backfill changelog pt 9 (cquil11, Dec 12, 2025)
f131962  backfill changelog pt 11 (cquil11, Dec 12, 2025)
dfeba21  change if condition for jobs in run sweep workflow (cquil11, Dec 12, 2025)
fd07f40  debugging run sweep workflow (cquil11, Dec 12, 2025)
228e0a2  debugging run sweep workflow pt 2 (cquil11, Dec 12, 2025)
cb2cc8a  debugging run sweep workflow pt 3 (revert) (cquil11, Dec 12, 2025)
055b324  debugging run sweep workflow pt 4 (cquil11, Dec 12, 2025)
ae65551  debugging run sweep workflow pt 5 (cquil11, Dec 12, 2025)
667d2e1  debugging run sweep workflow pt 6 (cquil11, Dec 12, 2025)
ef3ba6b  debugging run sweep workflow pt 7 (cquil11, Dec 12, 2025)
fae8278  add always() condition to upload changelog metadata (add back, this g… (cquil11, Dec 12, 2025)
2018ad3  add bmk prefix to results (cquil11, Dec 15, 2025)
5e0c779  backfill changelog official (cquil11, Dec 15, 2025)
8d8ffa1  for concurrency group, use more unique sha (cquil11, Dec 15, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-multinode-tmpl.yml
@@ -170,5 +170,5 @@ jobs:
       - name: Upload results
         uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
         with:
-          name: ${{ env.RESULT_FILENAME }}
+          name: bmk_${{ env.RESULT_FILENAME }}
           path: agg_${{ env.RESULT_FILENAME }}_*.json
2 changes: 1 addition & 1 deletion .github/workflows/benchmark-tmpl.yml
@@ -169,5 +169,5 @@ jobs:
       - name: Upload result
         uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
         with:
-          name: ${{ env.RESULT_FILENAME }}
+          name: bmk_${{ env.RESULT_FILENAME }}
           path: agg_${{ env.RESULT_FILENAME }}.json
10 changes: 5 additions & 5 deletions .github/workflows/collect-results.yml
@@ -3,7 +3,7 @@ name: Template - Collect Results
 on:
   workflow_call:
     inputs:
-      exp-name:
+      result-prefix:
         required: false
         type: string
         default: ''
@@ -26,18 +26,18 @@ jobs:
         uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
         with:
           path: results/
-          pattern: ${{ inputs.exp-name && format('{0}_*', inputs.exp-name) || '*' }}
+          pattern: ${{ inputs.result-prefix && format('{0}_*', inputs.result-prefix) || '*' }}

       - name: Print summary
         run: |
           pip install tabulate
           python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY

       - name: Aggregate results
-        run: python3 utils/collect_results.py results/ ${{ inputs.exp-name || 'all' }}
+        run: python3 utils/collect_results.py results/ ${{ inputs.result-prefix || 'all' }}

       - name: Upload aggregated results
         uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
         with:
-          name: results_${{ inputs.exp-name || 'all' }}
-          path: agg_${{ inputs.exp-name || 'all' }}.json
+          name: results_${{ inputs.result-prefix || 'all' }}
+          path: agg_${{ inputs.result-prefix || 'all' }}.json
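The renamed `result-prefix` input, together with the `bmk_` prefix added to benchmark artifact names in the two template workflows, lets the download step pull only benchmark result artifacts. A minimal Python sketch of how the pattern expression resolves and filters (the artifact names below are hypothetical, for illustration only):

```python
from fnmatch import fnmatch

def artifact_pattern(result_prefix):
    # Mirrors the workflow expression:
    # ${{ inputs.result-prefix && format('{0}_*', inputs.result-prefix) || '*' }}
    # An empty or missing prefix falls through to the match-everything glob.
    return f"{result_prefix}_*" if result_prefix else "*"

# Hypothetical artifact names, for illustration only.
artifacts = ["bmk_dsr1_fp8_h200", "bmk_gptoss_fp4_b200",
             "changelog-metadata", "run-stats"]

pattern = artifact_pattern("bmk")
selected = [name for name in artifacts if fnmatch(name, pattern)]
print(selected)  # only the bmk_-prefixed benchmark artifacts remain
```

With the prefix set to `bmk`, non-benchmark artifacts such as the changelog metadata and run stats are excluded from aggregation.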
2 changes: 0 additions & 2 deletions .github/workflows/full-sweep-1k1k-scheduler.yml
@@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 1k1k"

 on:
   workflow_dispatch:
-  schedule:
-    - cron: "0 0 * * *"

 jobs:
   get-dsr1-configs:
2 changes: 0 additions & 2 deletions .github/workflows/full-sweep-1k8k-scheduler.yml
@@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 1k8k"

 on:
   workflow_dispatch:
-  schedule:
-    - cron: "0 0 * * *"

 jobs:
   get-dsr1-configs:
2 changes: 0 additions & 2 deletions .github/workflows/full-sweep-8k1k-scheduler.yml
@@ -2,8 +2,6 @@ name: "Full Sweep Scheduler - 8k1k"

 on:
   workflow_dispatch:
-  schedule:
-    - cron: "0 0 * * *"

 jobs:
   get-dsr1-configs:
235 changes: 235 additions & 0 deletions .github/workflows/run-sweep.yml
@@ -0,0 +1,235 @@
name: "Run Sweep"
run-name: Run Sweep - ${{ github.event.pull_request.title || github.ref_name }}

concurrency:
  group: sweep-${{ github.event.pull_request.number || github.sha }}
  cancel-in-progress: true

on:
  push:
    branches:
      - main
    paths:
      - "perf-changelog.yaml"
  pull_request:
    branches:
      - main
    types:
      - ready_for_review
      - synchronize
      - labeled
    paths:
      - "perf-changelog.yaml"

jobs:
  setup:
    runs-on: ubuntu-latest
    if: >-
      (github.event_name == 'pull_request' && !github.event.pull_request.draft && contains(github.event.pull_request.labels.*.name, 'sweep-enabled')) ||
      (github.event_name != 'pull_request' && !contains(github.event.head_commit.message, '[skip-sweep]'))
    outputs:
      search-space-config: ${{ steps.setup.outputs.search-space-config }}
    steps:
      - name: Checkout code
        uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          fetch-depth: 0

      - id: setup
        run: |
          pip install pydantic

          if [ "${{ github.event_name }}" == "pull_request" ]; then
            BASE_REF="origin/${{ github.base_ref }}"
            HEAD_REF="${{ github.event.pull_request.head.sha }}"
          else
            BASE_REF="${{ github.event.before }}"
            HEAD_REF="${{ github.event.after }}"
          fi

          CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/process_changelog.py \
            --changelog-file ${GITHUB_WORKSPACE}/perf-changelog.yaml \
            --base-ref "$BASE_REF" \
            --head-ref "$HEAD_REF")

          echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT

  sweep-multi-node-1k1k:
    needs: setup
    if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k']) != 'null' }}
    uses: ./.github/workflows/benchmark-multinode-tmpl.yml
    name: multi-node 1k1k /
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k1k'] }}
    secrets: inherit
    with: &multi-node-inputs
      isl: ${{ matrix.config.isl }}
      osl: ${{ matrix.config.osl }}
      max-model-len: ${{ matrix.config.max-model-len }}
      runner: ${{ matrix.config.runner }}
      image: ${{ matrix.config.image }}
      model: ${{ matrix.config.model }}
      model-prefix: ${{ matrix.config.model-prefix }}
      framework: ${{ matrix.config.framework }}
      precision: ${{ matrix.config.precision }}
      exp-name: ${{ matrix.config.exp-name }}
      conc-list: ${{ toJson(matrix.config.conc) }}
      spec-decoding: ${{ matrix.config.spec-decoding }}
      disagg: ${{ matrix.config.disagg }}

      prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
      prefill-tp: ${{ matrix.config.prefill.tp }}
      prefill-ep: ${{ matrix.config.prefill.ep }}
      prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
      prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}

      decode-num-worker: ${{ matrix.config.decode.num-worker }}
      decode-tp: ${{ matrix.config.decode.tp }}
      decode-ep: ${{ matrix.config.decode.ep }}
      decode-dp-attn: ${{ matrix.config.decode.dp-attn }}
      decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }}

  sweep-multi-node-1k8k:
    needs: setup
    if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k']) != 'null' }}
    uses: ./.github/workflows/benchmark-multinode-tmpl.yml
    name: multi-node 1k8k /
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['1k8k'] }}
    secrets: inherit
    with: *multi-node-inputs

  sweep-multi-node-8k1k:
    needs: setup
    if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k']) != 'null' }}
    uses: ./.github/workflows/benchmark-multinode-tmpl.yml
    name: multi-node 8k1k /
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.setup.outputs.search-space-config).multi_node['8k1k'] }}
    secrets: inherit
    with: *multi-node-inputs

  sweep-single-node-1k1k:
    needs: setup
    if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k1k']) != 'null' }}
    uses: ./.github/workflows/benchmark-tmpl.yml
    name: single-node 1k1k /
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k1k'] }}
    secrets: inherit
    with: &single-node-inputs
      exp-name: ${{ matrix.config.exp-name }}
      isl: ${{ matrix.config.isl }}
      osl: ${{ matrix.config.osl }}
      max-model-len: ${{ matrix.config.max-model-len }}
      runner: ${{ matrix.config.runner }}
      image: ${{ matrix.config.image }}
      model: ${{ matrix.config.model }}
      model-prefix: ${{ matrix.config.model-prefix }}
      framework: ${{ matrix.config.framework }}
      precision: ${{ matrix.config.precision }}
      tp: ${{ matrix.config.tp }}
      ep: ${{ matrix.config.ep }}
      dp-attn: ${{ matrix.config.dp-attn }}
      conc: ${{ matrix.config.conc }}
      spec-decoding: ${{ matrix.config.spec-decoding }}
      disagg: ${{ matrix.config.disagg }}

  sweep-single-node-1k8k:
    needs: setup
    if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['1k8k']) != 'null' }}
    uses: ./.github/workflows/benchmark-tmpl.yml
    name: single-node 1k8k /
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['1k8k'] }}
    secrets: inherit
    with: *single-node-inputs

  sweep-single-node-8k1k:
    needs: setup
    if: ${{ toJson(fromJson(needs.setup.outputs.search-space-config).single_node['8k1k']) != 'null' }}
    uses: ./.github/workflows/benchmark-tmpl.yml
    name: single-node 8k1k /
    strategy:
      fail-fast: false
      matrix:
        config: ${{ fromJson(needs.setup.outputs.search-space-config).single_node['8k1k'] }}
    secrets: inherit
    with: *single-node-inputs

  collect-results:
    needs:
      [
        sweep-single-node-1k1k,
        sweep-single-node-1k8k,
        sweep-single-node-8k1k,
        sweep-multi-node-1k1k,
        sweep-multi-node-1k8k,
        sweep-multi-node-8k1k,
        setup,
      ]
    if: ${{ always() && needs.setup.result != 'skipped' }}
    uses: ./.github/workflows/collect-results.yml
    secrets: inherit
    with:
      result-prefix: "bmk"

  upload-changelog-metadata:
    needs: [setup, collect-results]
    if: ${{ always() && needs.setup.result != 'skipped' }}
    runs-on: ubuntu-latest
    steps:
      - name: Extract and save changelog metadata
        env:
          CONFIG_JSON: ${{ needs.setup.outputs.search-space-config }}
        run: |
          echo "$CONFIG_JSON" | jq '.changelog_metadata' > changelog_metadata.json

      - name: Upload changelog artifact
        uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: changelog-metadata
          path: changelog_metadata.json

  calc-success-rate:
    needs: collect-results
    if: ${{ always() && needs.collect-results.result != 'skipped' }}
    runs-on: ubuntu-latest

    env:
      RESULTS_DIR: "results/"
      STATS_FILENAME: "run_stats"
      GITHUB_TOKEN: ${{ secrets.REPO_PAT }}

    steps:
      - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0
        with:
          token: ${{ secrets.REPO_PAT }}
          fetch-depth: 0

      - name: Download results artifacts
        uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0
        with:
          path: ${{ env.RESULTS_DIR }}
          pattern: results_*

      - name: Install python dependencies
        run: pip install PyGithub

      - name: Calculate success rate
        run: python3 utils/calc_success_rate.py $STATS_FILENAME

      - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0
        with:
          name: "run-stats"
          path: ${{ env.STATS_FILENAME }}.json
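The setup job above installs pydantic and invokes `utils/process_changelog.py` with a base and head ref; per the commit history, it raises if the YAML diff in `perf-changelog.yaml` is not valid. The script itself is not part of this diff, so the following is only a sketch of the implied approach — the field names mirror the keys used in `perf-changelog.yaml`, while the function name and sample data are hypothetical:

```python
from typing import List

from pydantic import BaseModel, Field, ValidationError  # installed by the setup job


class ChangelogEntry(BaseModel):
    # Hypothetical schema inferred from the keys used in perf-changelog.yaml.
    config_keys: List[str] = Field(alias="config-keys")
    description: str
    PR: str


def validate_new_entries(base: list, head: list) -> List[ChangelogEntry]:
    """Validate entries present at HEAD_REF but not at BASE_REF.

    `base` and `head` are the parsed perf-changelog.yaml contents at the two
    refs (e.g. via `git show "$BASE_REF:perf-changelog.yaml"` plus yaml.safe_load).
    A malformed entry raises ValidationError, which would fail the setup job.
    """
    added = [entry for entry in head if entry not in base]
    return [ChangelogEntry(**entry) for entry in added]


base: list = []  # changelog empty (or absent) at the base ref
head = [{
    "config-keys": ["dsr1*"],
    "description": "- Fixes bug where full sweeps had incorrect max-model-len\n",
    "PR": "https://github.com/InferenceMAX/InferenceMAX/pull/163",
}]

entries = validate_new_entries(base, head)
print([e.config_keys for e in entries])
```

In the workflow, the resulting search-space config is serialized to JSON and written to `$GITHUB_OUTPUT` as `search-space-config`, which the downstream sweep jobs parse with `fromJson`.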
83 changes: 83 additions & 0 deletions perf-changelog.yaml
@@ -0,0 +1,83 @@
- config-keys:
    - 70b-fp8-*-vllm
  description: |
    - Add compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' as
      extra config to all benchmarks/70b_fp8_mi*.sh scripts
    - 6-7% uplift for llama for 6/8 configs
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/95
- config-keys:
    - gptoss-fp4-*-trt
  description: |
    - Upgrade GPT-OSS TRT images from 'release:1.1.0rc2.post2' to '1.2.0rc0.post1'
    - Add NCCL_GRAPH_REGISTER=0 to benchmarks/gptoss_fp4_b200_trt_slurm.sh
    - Change kv_cache_config.dtype from 'auto' to 'fp8' in benchmarks/gptoss_fp4_b200_trt_slurm.sh
    - Remove MOE_BACKEND=CUTLASS, now just defaults to TRTLLM
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/110
- config-keys:
    - gptoss*
    - dsr1*
  description: |
    - Remove Llama 70B runs to make room for multi-node disagg prefill+wideEP on
      h100/h200/b200/mi300/mi325/mi355
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/149
- config-keys:
    - gptoss-fp4-b200-vllm
    - gptoss-fp4-h100-vllm
    - gptoss-fp4-h200-vllm
  description: |
    - Upgrade vLLM from 0.10.2 to 0.11.0 for GPT-OSS NVIDIA single-node configs
    - Adds compilation-config: '{"cudagraph_mode":"PIECEWISE"}' accordingly, since vLLM 0.11.0
      now defaults to FULL_AND_PIECEWISE
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/159
- config-keys:
    - dsr1*
  description: |
    - Fixes bug where 1k8k and 8k1k full sweeps had incorrect max-model-len for DeepSeek
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/163
- config-keys:
    - dsr1-fp4-b200-sglang
    - dsr1-fp8-b200-sglang
    - dsr1-fp8-h200-sglang
  description: |
    - Consolidates H200 and B200 SGLang configurations to use the unified v0.5.5-cu129-amd64
      image tag and updates deprecated SGLang server arguments to their current equivalents.
    - --enable-flashinfer-trtllm-moe and --enable-ep-moe are no longer available in SGLang, so they had to be replaced
    - ep: 4 for all tp: 4 entries (3 occurrences in dsr1-fp4-b200-sglang)
    - ep: 8 for all tp: 8 entries (6 occurrences across dsr1-fp4-b200-sglang and dsr1-fp8-b200-sglang)
    - dsr1_fp4_b200_docker.sh: Replaced --enable-ep-moe with --ep-size $EP_SIZE and --enable-flashinfer-trtllm-moe with
      --moe-runner-backend flashinfer_trtllm
    - dsr1_fp8_b200_docker.sh: Replaced --enable-flashinfer-trtllm-moe with --moe-runner-backend flashinfer_trtllm and
      added --ep-size $EP_SIZE
    - launch_b200-nvd.sh: Added -e EP_SIZE to Docker run command to pass environment variable to container
    - launch_b200-tg.sh: Added -e EP_SIZE to Docker run command to pass environment variable to container
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/204
- config-keys:
    - gptoss-fp4-mi355x-vllm
    - gptoss-fp4-b200-vllm
  description: |
    - Extend concurrency to 128 for gptoss mi355x/b200 vllm configurations
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/209
- config-keys:
    - gptoss-fp4-b200-trt
  description: |
    - Extend concurrency to 128 for gptoss b200 TRT configurations
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/233
- config-keys:
    - "*gb200-sglang"
  description: |
    - Introduce improvements to the GB200 SGLang DSR1 submission
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/257
- config-keys:
    - dsr1-fp8-h200-trt
  description: |
    - Update TRT image from nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc0.post1 to nvcr.io/nvidia/tensorrt-llm/release:1.2.0rc2
    - Increase concurrency for some configurations
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/266
- config-keys:
    - gptoss-fp4-b200-vllm
    - gptoss-fp4-h100-vllm
    - gptoss-fp4-h200-vllm
  description: |
    - Update vLLM image for NVIDIA configs from vLLM 0.11.0 to vLLM 0.11.2
    - Adds kv-cache-dtype: fp8 to benchmarks/gptoss_fp4_b200_docker.sh
  PR: https://github.com/InferenceMAX/InferenceMAX/pull/273
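The config-keys values above are shell-style globs (70b-fp8-*-vllm, gptoss*, "*gb200-sglang"). Assuming the selection logic matches each known config name against every glob in an entry, Python's fnmatch reproduces the semantics; the config names below are illustrative, and dsr1-fp8-gb200-sglang is invented here so the "*gb200-sglang" glob has something to hit:

```python
from fnmatch import fnmatchcase

# Config names seen in this changelog; dsr1-fp8-gb200-sglang is invented
# for the example.
configs = [
    "dsr1-fp4-b200-sglang",
    "dsr1-fp8-h200-trt",
    "gptoss-fp4-b200-vllm",
    "gptoss-fp4-b200-trt",
    "dsr1-fp8-gb200-sglang",
]

def select(globs):
    # A config is affected if any of the entry's config-keys globs matches it.
    return [c for c in configs if any(fnmatchcase(c, g) for g in globs)]

print(select(["gptoss*"]))        # the two gptoss configs
print(select(["*gb200-sglang"]))  # the gb200 SGLang config only
```

Note that "*gb200-sglang" must be quoted in the YAML because a bare leading `*` would be parsed as a YAML alias.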