From 4f9ee5e6c2ddf4e05a54362f5b1250a72d312dfa Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 09:21:18 -0700 Subject: [PATCH 1/6] fix vllm launch --- .github/workflows/70b-tmpl.yml | 154 +++++++++++------------ .github/workflows/workflow-scheduler.yml | 102 +++++++-------- benchmarks/70b_b200_slurm.sh | 3 +- runners/launch_b200-nv.sh | 2 +- 4 files changed, 130 insertions(+), 131 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index faa51d369..83717ede9 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -30,37 +30,37 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." - bmk-h100: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h100 - image: 'kedarpotdar147/vllm0.1:latest' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-h100: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h100 + # image: 'kedarpotdar147/vllm0.1:latest' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[2, 4, 8]' + # timeout: ${{ inputs.timeout }} - bmk-h200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200 - image: 'kedarpotdar147/vllm0.1:latest' 
- model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-h200: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h200 + # image: 'kedarpotdar147/vllm0.1:latest' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -75,59 +75,59 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm:05' model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[1, 2, 4, 8]' + tp-list: '[2]' timeout: ${{ inputs.timeout }} - bmk-mi300x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-mi300x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi300x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} - bmk-mi325x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ 
inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2, 4, 8]' - timeout: ${{ inputs.timeout }} + # bmk-mi325x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi325x + # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2, 4, 8]' + # timeout: ${{ inputs.timeout }} - bmk-mi355x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi355x - image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - tp-list: '[1, 2]' - timeout: ${{ inputs.timeout }} + # bmk-mi355x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi355x + # image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' + # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + # tp-list: '[1, 2]' + # timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] + needs: [bmk-b200 ] if: ${{ always() && !cancelled() }} uses: 
./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index 115452ddf..84b1cfc26 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -24,58 +24,58 @@ jobs: max-model-len: 2048 random-range-ratio: 0.8 - dsr1-1k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 + # dsr1-1k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k1k' + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # random-range-ratio: 0.8 - _70b-8k1k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # _70b-8k1k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - dsr1-8k1k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-8k1k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 - _70b-1k8k: - needs: cleanup - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - timeout: 240 + # _70b-1k8k: + # needs: cleanup + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 
+ # timeout: 240 - dsr1-1k8k: - needs: cleanup - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 + # dsr1-1k8k: + # needs: cleanup + # uses: ./.github/workflows/dsr1-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'dsr1_1k8k' + # isl: 1024 + # osl: 8192 + # max-model-len: 9216 + # random-range-ratio: 0.8 diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh index f11133cc4..a07a3070e 100644 --- a/benchmarks/70b_b200_slurm.sh +++ b/benchmarks/70b_b200_slurm.sh @@ -20,8 +20,7 @@ hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -pip install "git+https://github.com/flashinfer-ai/flashinfer.git@9720182476ede910698f8d783c29b2ec91cec023#egg=flashinfer-python" -pip install --upgrade --no-deps nvidia-nccl-cu12==2.26.2.post1 +pip install flashinfer-python==0.3.0 export TORCH_CUDA_ARCH_LIST="10.0" export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index 2808758db..e2e21d066 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${1%%_*}" PARTITION="dgx-b200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-2.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-vllm-3.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) From 7e6577b06a74b65a362cb9dafbbeb754cfd942ef Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 09:34:55 -0700 Subject: [PATCH 2/6] re-enable dsr1 and update image ID to re-fetch --- .github/workflows/dsr1-tmpl.yml | 122 +++++++++++------------ .github/workflows/workflow-scheduler.yml | 20 ++-- runners/launch_b200-nv.sh | 2 +- 3 files changed, 72 insertions(+), 72 deletions(-) diff --git a/.github/workflows/dsr1-tmpl.yml 
b/.github/workflows/dsr1-tmpl.yml index 07030c387..14872b044 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -30,21 +30,21 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." - bmk-h200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200 - image: 'lmsysorg/sglang:v0.4.9.post1-cu126' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-h200: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: h200 + # image: 'lmsysorg/sglang:v0.4.9.post1-cu126' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -62,56 +62,56 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} - bmk-mi300x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-mi300x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ 
inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi300x + # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} - bmk-mi325x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-mi325x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: mi325x + # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} - bmk-mi355x: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi355x - image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' - model: 'deepseek-ai/DeepSeek-R1-0528' - tp-list: '[8]' - timeout: ${{ inputs.timeout }} + # bmk-mi355x: + # needs: find-latest-image + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: 
mi355x + # image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # tp-list: '[8]' + # timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] + needs: [ bmk-b200,] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml index 84b1cfc26..a0964526b 100644 --- a/.github/workflows/workflow-scheduler.yml +++ b/.github/workflows/workflow-scheduler.yml @@ -24,16 +24,16 @@ jobs: max-model-len: 2048 random-range-ratio: 0.8 - # dsr1-1k1k: - # needs: cleanup - # uses: ./.github/workflows/dsr1-tmpl.yml - # secrets: inherit - # with: - # exp-name: 'dsr1_1k1k' - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # random-range-ratio: 0.8 + dsr1-1k1k: + needs: cleanup + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 # _70b-8k1k: # needs: cleanup diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh index e2e21d066..8bf679d58 100644 --- a/runners/launch_b200-nv.sh +++ b/runners/launch_b200-nv.sh @@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1} MODEL_CODE="${1%%_*}" PARTITION="dgx-b200" -SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-vllm-3.sqsh" +SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-0903.sqsh" salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell JOB_ID=$(squeue -u $USER -h -o %A | head -n1) From 22c9710ba61b163eb4540d1fd909bcd4c2b968fd Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 14:55:14 -0700 Subject: [PATCH 3/6] rollback dsr1 --- .github/workflows/dsr1-tmpl.yml | 2 +- benchmarks/dsr1_b200_docker.sh | 72 ++++++++++++++++++++++++++------- benchmarks/dsr1_b200_slurm.sh | 9 ++--- 3 files changed, 62 insertions(+), 21 deletions(-) diff --git 
a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 14872b044..811c2c422 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -57,7 +57,7 @@ jobs: max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} runner: b200 - image: 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' + image: 'lmsysorg/sglang:v0.4.10.post1-cu128-b200' model: 'deepseek-ai/DeepSeek-R1-0528' tp-list: '[8]' timeout: ${{ inputs.timeout }} diff --git a/benchmarks/dsr1_b200_docker.sh b/benchmarks/dsr1_b200_docker.sh index 68f83a169..df65471f6 100644 --- a/benchmarks/dsr1_b200_docker.sh +++ b/benchmarks/dsr1_b200_docker.sh @@ -1,20 +1,62 @@ #!/usr/bin/env bash -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -export SGL_ENABLE_JIT_DEEPGEMM=0 -export SGLANG_ENABLE_FLASHINFER_GEMM=1 +while [ -n "$(docker ps -aq)" ]; do + docker rm -f $(docker ps -aq) + docker network prune -f + sleep 5 +done + +network_name="bmk-net" +server_name="bmk-server" +client_name="bmk-client" +port=8888 + +docker network create $network_name set -x -python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ +docker run --rm -d --network $network_name --name $server_name \ +--runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ +-v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +-e HF_TOKEN=$HF_TOKEN -e HF_HUB_CACHE=$HF_HUB_CACHE -e SGL_ENABLE_JIT_DEEPGEMM=0 \ +--entrypoint=python3 \ +$IMAGE \ +-m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $port --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ ---kv-cache-dtype=fp8_e4m3 --mem-fraction-static=0.82 \ ---max-prefill-tokens=32768 --chunked-prefill-size=32768 --cuda-graph-max-bs=128 --max-running-requests=128 \ ---disable-radix-cache --enable-flashinfer-trtllm-moe --attention-backend=trtllm_mla 
--stream-interval=1 +--cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ +--chunked-prefill-size 32768 --max-prefill-tokens 32768 \ +--disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + docker stop $server_name + exit 1 + fi + if [[ "$line" == *"The server is fired up and ready to roll!"* ]]; then + break + fi +done < <(docker logs -f --tail=0 $server_name 2>&1) + +git clone https://github.com/kimbochen/bench_serving.git + +set -x +docker run --rm --network $network_name --name $client_name \ +-v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ -e HF_TOKEN=$HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ +--entrypoint=python3 \ +$IMAGE \ +bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm --base-url http://$server_name:$port \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) \ +--max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics "ttft,tpot,itl,e2el" \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json + +while [ -n "$(docker ps -aq)" ]; do + docker stop $server_name + docker network rm $network_name + sleep 5 +done \ No newline at end of file diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh index 2aa45be79..9a1ba6271 100644 --- a/benchmarks/dsr1_b200_slurm.sh +++ b/benchmarks/dsr1_b200_slurm.sh @@ -7,13 +7,12 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x PORT=$(( 8888 + $PORT_OFFSET )) -export SGL_ENABLE_JIT_DEEPGEMM=false -export SGLANG_ENABLE_FLASHINFER_GEMM=true +export SGL_ENABLE_JIT_DEEPGEMM=0 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ 
---cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \ +--cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ ---disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \ +--disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe \ > $SERVER_LOG 2>&1 & set +x @@ -41,4 +40,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +--result-filename $RESULT_FILENAME.json \ No newline at end of file From 492de4cd041a7b90484bdba115d76126486b2ec7 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 15:22:11 -0700 Subject: [PATCH 4/6] fix dsr1, remove 70b --- .github/workflows/70b-tmpl.yml | 30 +++++++------- .github/workflows/dsr1-tmpl.yml | 2 +- benchmarks/dsr1_b200_docker.sh | 72 +++++++-------------------------- benchmarks/dsr1_b200_slurm.sh | 43 +++++++++++++------- 4 files changed, 59 insertions(+), 88 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 83717ede9..73ad841cf 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -62,21 +62,21 @@ jobs: # tp-list: '[1, 2, 4, 8]' # timeout: ${{ inputs.timeout }} - bmk-b200: - needs: find-latest-image - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200 - image: 'kedarpotdar147/vllm:05' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[2]' - timeout: ${{ inputs.timeout }} + # bmk-b200: + # needs: find-latest-image + # uses: 
./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} + # runner: b200 + # image: 'kedarpotdar147/vllm:05' + # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + # tp-list: '[2]' + # timeout: ${{ inputs.timeout }} # bmk-mi300x: # needs: find-latest-image diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 811c2c422..14872b044 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -57,7 +57,7 @@ jobs: max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} runner: b200 - image: 'lmsysorg/sglang:v0.4.10.post1-cu128-b200' + image: 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' model: 'deepseek-ai/DeepSeek-R1-0528' tp-list: '[8]' timeout: ${{ inputs.timeout }} diff --git a/benchmarks/dsr1_b200_docker.sh b/benchmarks/dsr1_b200_docker.sh index df65471f6..68f83a169 100644 --- a/benchmarks/dsr1_b200_docker.sh +++ b/benchmarks/dsr1_b200_docker.sh @@ -1,62 +1,20 @@ #!/usr/bin/env bash -while [ -n "$(docker ps -aq)" ]; do - docker rm -f $(docker ps -aq) - docker network prune -f - sleep 5 -done - -network_name="bmk-net" -server_name="bmk-server" -client_name="bmk-client" -port=8888 - -docker network create $network_name +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +export SGL_ENABLE_JIT_DEEPGEMM=0 +export SGLANG_ENABLE_FLASHINFER_GEMM=1 set -x -docker run --rm -d --network $network_name --name $server_name \ ---runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \ --v $HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --e HF_TOKEN=$HF_TOKEN -e HF_HUB_CACHE=$HF_HUB_CACHE -e SGL_ENABLE_JIT_DEEPGEMM=0 \ ---entrypoint=python3 \ -$IMAGE \ --m sglang.launch_server 
--model-path $MODEL --host 0.0.0.0 --port $port --trust-remote-code \ +python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ ---cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ ---chunked-prefill-size 32768 --max-prefill-tokens 32768 \ ---disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - docker stop $server_name - exit 1 - fi - if [[ "$line" == *"The server is fired up and ready to roll!"* ]]; then - break - fi -done < <(docker logs -f --tail=0 $server_name 2>&1) - -git clone https://github.com/kimbochen/bench_serving.git - -set -x -docker run --rm --network $network_name --name $client_name \ --v $GITHUB_WORKSPACE:/workspace/ -w /workspace/ -e HF_TOKEN=$HF_TOKEN -e PYTHONPYCACHEPREFIX=/tmp/pycache/ \ ---entrypoint=python3 \ -$IMAGE \ -bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm --base-url http://$server_name:$port \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) \ ---max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - -while [ -n "$(docker ps -aq)" ]; do - docker stop $server_name - docker network rm $network_name - sleep 5 -done \ No newline at end of file +--kv-cache-dtype=fp8_e4m3 --mem-fraction-static=0.82 \ +--max-prefill-tokens=32768 --chunked-prefill-size=32768 --cuda-graph-max-bs=128 --max-running-requests=128 \ +--disable-radix-cache --enable-flashinfer-trtllm-moe --attention-backend=trtllm_mla --stream-interval=1 diff --git a/benchmarks/dsr1_b200_slurm.sh b/benchmarks/dsr1_b200_slurm.sh index 
9a1ba6271..28e9e2a32 100644 --- a/benchmarks/dsr1_b200_slurm.sh +++ b/benchmarks/dsr1_b200_slurm.sh @@ -7,27 +7,40 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) set -x PORT=$(( 8888 + $PORT_OFFSET )) -export SGL_ENABLE_JIT_DEEPGEMM=0 +export SGL_ENABLE_JIT_DEEPGEMM=false +export SGLANG_ENABLE_FLASHINFER_GEMM=true python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tensor-parallel-size=$TP --data-parallel-size=1 \ ---cuda-graph-max-bs 256 --max-running-requests 512 --mem-fraction-static 0.89 \ +--cuda-graph-max-bs 128 --max-running-requests 128 --mem-fraction-static 0.82 --kv-cache-dtype fp8_e4m3 \ --chunked-prefill-size 32768 --max-prefill-tokens 32768 \ ---disable-radix-cache --attention-backend trtllm_mla --disable-shared-experts-fusion --enable-flashinfer-trtllm-moe \ +--disable-radix-cache --attention-backend trtllm_mla --enable-flashinfer-trtllm-moe --stream-interval 1 \ > $SERVER_LOG 2>&1 & set +x +IGNORE_PAT="Ignore import error when loading sglang.srt.models.glm4v_moe: No module named 'transformers.models.glm4v_moe'" + while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") + printf '%s\n' "$line" + + # Skip the known benign "Ignore import error ..." 
line + if [[ "$line" == *"$IGNORE_PAT"* ]]; then + continue + fi + + # Keep your original "error" trap for everything else + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 "$SERVER_LOG" + echo "JOB ${SLURM_JOB_ID:-NA} ran on NODE ${SLURMD_NODENAME:-unknown}" + exit 1 + fi + + # Break when server is ready + if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then + break + fi +# Start tail from the beginning so we don't miss early lines +done < <(tail -n +1 -F "$SERVER_LOG") set -x git clone https://github.com/kimbochen/bench_serving.git @@ -40,4 +53,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file +--result-filename $RESULT_FILENAME.json From 2e21fe9f309a867bd8315ce769f88ead4711feca Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 15:23:24 -0700 Subject: [PATCH 5/6] readd 70b --- .github/workflows/70b-tmpl.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 73ad841cf..3a2726bbe 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -62,21 +62,21 @@ jobs: # tp-list: '[1, 2, 4, 8]' # timeout: ${{ inputs.timeout }} - # bmk-b200: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: b200 - # image: 'kedarpotdar147/vllm:05' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[2]' - # timeout: ${{ inputs.timeout }} + bmk-b200: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + 
with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: b200 + image: 'kedarpotdar147/vllm:05' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} # bmk-mi300x: # needs: find-latest-image From 594bc88792e3bdb13110be465d41cb1c9e73bf20 Mon Sep 17 00:00:00 2001 From: Kedar Potdar Date: Wed, 3 Sep 2025 16:15:44 -0700 Subject: [PATCH 6/6] re-add other tests --- .github/workflows/70b-tmpl.yml | 154 ++++++++++++++++---------------- .github/workflows/dsr1-tmpl.yml | 122 ++++++++++++------------- 2 files changed, 138 insertions(+), 138 deletions(-) diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 3a2726bbe..921909434 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -30,37 +30,37 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now." 
- # bmk-h100: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: h100 - # image: 'kedarpotdar147/vllm0.1:latest' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[2, 4, 8]' - # timeout: ${{ inputs.timeout }} + bmk-h100: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h100 + image: 'kedarpotdar147/vllm0.1:latest' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[2, 4, 8]' + timeout: ${{ inputs.timeout }} - # bmk-h200: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: h200 - # image: 'kedarpotdar147/vllm0.1:latest' - # model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - # tp-list: '[1, 2, 4, 8]' - # timeout: ${{ inputs.timeout }} + bmk-h200: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h200 + image: 'kedarpotdar147/vllm0.1:latest' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -75,59 +75,59 @@ jobs: runner: b200 image: 'kedarpotdar147/vllm:05' model: 
'nvidia/Llama-3.3-70B-Instruct-FP8' - tp-list: '[8]' + tp-list: '[1, 2]' timeout: ${{ inputs.timeout }} - # bmk-mi300x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi300x - # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - # tp-list: '[1, 2, 4, 8]' - # timeout: ${{ inputs.timeout }} + bmk-mi300x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi300x + image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} - # bmk-mi325x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi325x - # image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718' - # model:
'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2, 4, 8]' + timeout: ${{ inputs.timeout }} - # bmk-mi355x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi355x - # image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' - # model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - # tp-list: '[1, 2]' - # timeout: ${{ inputs.timeout }} + bmk-mi355x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi355x + image: 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.9.1_mi35x_alpha' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + tp-list: '[1, 2]' + timeout: ${{ inputs.timeout }} collect-results: - needs: [bmk-b200 ] + needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, bmk-mi355x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 14872b044..59e81c38b 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -30,21 +30,21 @@ jobs: - name: Find the latest Docker image run: echo "Hardcoding image tags for now."
- # bmk-h200: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: h200 - # image: 'lmsysorg/sglang:v0.4.9.post1-cu126' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-h200: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: h200 + image: 'lmsysorg/sglang:v0.4.9.post1-cu126' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} bmk-b200: needs: find-latest-image @@ -62,56 +62,56 @@ jobs: tp-list: '[8]' timeout: ${{ inputs.timeout }} - # bmk-mi300x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi300x - # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-mi300x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi300x + image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} - # 
bmk-mi325x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi325x - # image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-mi325x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi325x + image: 'lmsysorg/sglang:v0.4.9.post2-rocm630-mi30x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} - # bmk-mi355x: - # needs: find-latest-image - # uses: ./.github/workflows/benchmark-tmpl.yml - # secrets: inherit - # with: - # exp-name: ${{ inputs.exp-name }} - # isl: ${{ inputs.isl }} - # osl: ${{ inputs.osl }} - # max-model-len: ${{ inputs.max-model-len }} - # random-range-ratio: ${{ inputs.random-range-ratio }} - # runner: mi355x - # image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' - # model: 'deepseek-ai/DeepSeek-R1-0528' - # tp-list: '[8]' - # timeout: ${{ inputs.timeout }} + bmk-mi355x: + needs: find-latest-image + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + runner: mi355x + image: 'lmsysorg/sglang:v0.5.1.post2-rocm700-mi35x' + model: 'deepseek-ai/DeepSeek-R1-0528' + tp-list: '[8]' + timeout: ${{ inputs.timeout }} collect-results: - needs: [ bmk-b200,] + needs: [ bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x, 
bmk-mi355x] if: ${{ always() && !cancelled() }} uses: ./.github/workflows/collect-results.yml secrets: inherit