From d556b8862a6c54c6ff393fef4c7719f1e0aa6fab Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 09:58:28 -0700
Subject: [PATCH 01/28] add trt init for 70b

---
 .github/workflows/70b-tmpl.yml   | 18 ++++++++-
 benchmarks/70b_b200-trt_slurm.sh | 63 ++++++++++++++++++++++++++++++++
 runners/launch_b200-trt.sh       | 23 ++++++++++++
 3 files changed, 103 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/70b_b200-trt_slurm.sh
 create mode 100644 runners/launch_b200-trt.sh

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 23ad88551..d196fc575 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -78,6 +78,22 @@ jobs:
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
 
+  bmk-b200-trt:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: b200-trt
+      image: 'nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc0'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      tp-list: '[1, 2, 4, 8]'
+      timeout: ${{ inputs.timeout }}
+
   bmk-mi300x:
     needs: find-latest-image
     uses: ./.github/workflows/benchmark-tmpl.yml
@@ -111,7 +127,7 @@ jobs:
       timeout: ${{ inputs.timeout }}
 
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x]
+    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/benchmarks/70b_b200-trt_slurm.sh b/benchmarks/70b_b200-trt_slurm.sh
new file mode 100644
index 000000000..0286b9d5c
--- /dev/null
+++ b/benchmarks/70b_b200-trt_slurm.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+
+# === Required Env Vars === 
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+set -x
+hf download $MODEL
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+
+# Create llama-config.yml inline
+cat > llama-config.yml << 'EOF'
+enable_attention_dp: false 
+cuda_graph_config: 
+  enable_padding: true 
+  max_batch_size: 1024 
+kv_cache_config: 
+  dtype: fp8 
+  enable_block_reuse: false 
+stream_interval: 4
+EOF
+
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml > $SERVER_LOG 2>&1 &
+
+set +x
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
+        sleep 5
+        tail -n100 $SERVER_LOG
+        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
+        exit 1
+    fi
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        break
+    fi
+done < <(tail -F -n0 "$SERVER_LOG")
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model $MODEL --backend openai \
+--base-url http://0.0.0.0:$PORT \
+--dataset-name random \
+--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \
+--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename $RESULT_FILENAME.json
diff --git a/runners/launch_b200-trt.sh b/runners/launch_b200-trt.sh
new file mode 100644
index 000000000..ec53ea7c8
--- /dev/null
+++ b/runners/launch_b200-trt.sh
@@ -0,0 +1,23 @@
+#!/usr/bin/bash
+
+export HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
+export PORT_OFFSET=${USER: -1}
+
+MODEL_CODE="${1%%_*}"
+PARTITION="dgx-b200"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-trt.sqsh"
+
+salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
+JOB_ID=$(squeue -u $USER -h -o %A)
+
+set -x
+srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
+srun --jobid=$JOB_ID \
+--container-image=$SQUASH_FILE \
+--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mount-home \
+--container-workdir=/workspace/ \
+--no-container-entrypoint --export=ALL \
+bash benchmarks/${MODEL_CODE}_b200-trt_slurm.sh
+
+scancel $JOB_ID

From 426f48e4ef3cc447bbce2f2ff794ac4517fc149b Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 10:19:09 -0700
Subject: [PATCH 02/28] remove dsr1 and add $MAX_MODEL_LEN to launch configs

---
 .github/workflows/workflow-scheduler.yml | 60 ++++++++++++------------
 benchmarks/70b_b200-trt_slurm.sh         |  2 +-
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index ce03740fc..c952000d3 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -24,16 +24,16 @@ jobs:
       max-model-len: 2048
       random-range-ratio: 0.8
   
-  dsr1-1k1k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k1k'
-      isl: 1024
-      osl: 1024
-      max-model-len: 2048
-      random-range-ratio: 0.8
+  # dsr1-1k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k1k'
+  #     isl: 1024
+  #     osl: 1024
+  #     max-model-len: 2048
+  #     random-range-ratio: 0.8
 
   _70b-8k1k:
     needs: cleanup
@@ -46,16 +46,16 @@ jobs:
       max-model-len: 9216
       random-range-ratio: 0.8
   
-  dsr1-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
 
   _70b-1k8k:
     needs: cleanup
@@ -69,13 +69,13 @@ jobs:
       random-range-ratio: 0.8
       timeout: 240
 
-  dsr1-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
diff --git a/benchmarks/70b_b200-trt_slurm.sh b/benchmarks/70b_b200-trt_slurm.sh
index 0286b9d5c..3540fb9ce 100644
--- a/benchmarks/70b_b200-trt_slurm.sh
+++ b/benchmarks/70b_b200-trt_slurm.sh
@@ -33,7 +33,7 @@ kv_cache_config:
 stream_interval: 4
 EOF
 
-mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml > $SERVER_LOG 2>&1 &
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN--num_postprocess_workers 2 --extra_llm_api_options llama-config.yml > $SERVER_LOG 2>&1 &
 
 set +x
 while IFS= read -r line; do

From 12a7f6e4a134edc7623ca2851c4f927687333069 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 10:32:00 -0700
Subject: [PATCH 03/28] remove b200 tg

---
 .github/workflows/cluster-cleanup.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/cluster-cleanup.yml b/.github/workflows/cluster-cleanup.yml
index e0f30ae17..a74311d9f 100644
--- a/.github/workflows/cluster-cleanup.yml
+++ b/.github/workflows/cluster-cleanup.yml
@@ -47,7 +47,7 @@ jobs:
         runner:
           - 'h100-cr_0'
           - 'h100-cr_1'
-          - 'b200-tg_0'
+          # - 'b200-tg_0'
           - 'mi300x-cr_0'
           - 'mi300x-amd_0'
           - 'mi300x-amd_1'

From 0fc8ab472528a28d1c261cc80e98063eb1c17204 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 11:21:24 -0700
Subject: [PATCH 04/28] add RUNNER_LABEL env var and temporarily disable bmk-b200

---
 .github/workflows/70b-tmpl.yml       | 32 ++++++++++++++--------------
 .github/workflows/benchmark-tmpl.yml |  1 +
 benchmarks/70b_b200-trt_slurm.sh     |  2 +-
 runners/launch_b200-nv.sh            |  2 +-
 runners/launch_b200-trt.sh           | 23 --------------------
 5 files changed, 19 insertions(+), 41 deletions(-)
 delete mode 100644 runners/launch_b200-trt.sh

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index d196fc575..76c062773 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -62,21 +62,21 @@ jobs:
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
 
-  bmk-b200:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: b200
-      image: 'kedarpotdar147/vllm0.1:latest'
-      model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
+  # bmk-b200:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: b200
+  #     image: 'kedarpotdar147/vllm0.1:latest'
+  #     model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
+  #     tp-list: '[1, 2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
 
   bmk-b200-trt:
     needs: find-latest-image
@@ -127,7 +127,7 @@ jobs:
       timeout: ${{ inputs.timeout }}
 
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
+    needs: [bmk-h100, bmk-h200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 7e4e0b708..e9b37cb3a 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -43,6 +43,7 @@ env:
   MAX_MODEL_LEN: ${{ inputs.max-model-len }}
   RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
   IMAGE: ${{ inputs.image }}
+  RUNNER_LABEL: ${{ inputs.runner }}
 
 jobs:
   benchmark:
diff --git a/benchmarks/70b_b200-trt_slurm.sh b/benchmarks/70b_b200-trt_slurm.sh
index 3540fb9ce..e2461aac6 100644
--- a/benchmarks/70b_b200-trt_slurm.sh
+++ b/benchmarks/70b_b200-trt_slurm.sh
@@ -33,7 +33,7 @@ kv_cache_config:
 stream_interval: 4
 EOF
 
-mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN--num_postprocess_workers 2 --extra_llm_api_options llama-config.yml > $SERVER_LOG 2>&1 &
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml > $SERVER_LOG 2>&1 &
 
 set +x
 while IFS= read -r line; do
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 83f1ec801..5acd79743 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \
 --container-mount-home \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_b200_slurm.sh
+bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_b200-trt.sh b/runners/launch_b200-trt.sh
deleted file mode 100644
index ec53ea7c8..000000000
--- a/runners/launch_b200-trt.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/bash
-
-export HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
-export PORT_OFFSET=${USER: -1}
-
-MODEL_CODE="${1%%_*}"
-PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_b200-trt.sqsh"
-
-salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
-JOB_ID=$(squeue -u $USER -h -o %A)
-
-set -x
-srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
-srun --jobid=$JOB_ID \
---container-image=$SQUASH_FILE \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
---container-mount-home \
---container-workdir=/workspace/ \
---no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_b200-trt_slurm.sh
-
-scancel $JOB_ID

From 4b30c03579959ef8c46cbf4ed8cba9a2a231e49c Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 11:39:45 -0700
Subject: [PATCH 05/28] fix per kimbo's suggestion

---
 .github/workflows/70b-tmpl.yml       | 32 ++++++++++++++--------------
 .github/workflows/benchmark-tmpl.yml |  2 +-
 2 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 76c062773..d196fc575 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -62,21 +62,21 @@ jobs:
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
 
-  # bmk-b200:
-  #   needs: find-latest-image
-  #   uses: ./.github/workflows/benchmark-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: ${{ inputs.exp-name }}
-  #     isl: ${{ inputs.isl }}
-  #     osl: ${{ inputs.osl }}
-  #     max-model-len: ${{ inputs.max-model-len }}
-  #     random-range-ratio: ${{ inputs.random-range-ratio }}
-  #     runner: b200
-  #     image: 'kedarpotdar147/vllm0.1:latest'
-  #     model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-  #     tp-list: '[1, 2, 4, 8]'
-  #     timeout: ${{ inputs.timeout }}
+  bmk-b200:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: b200
+      image: 'kedarpotdar147/vllm0.1:latest'
+      model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
+      tp-list: '[1, 2, 4, 8]'
+      timeout: ${{ inputs.timeout }}
 
   bmk-b200-trt:
     needs: find-latest-image
@@ -127,7 +127,7 @@ jobs:
       timeout: ${{ inputs.timeout }}
 
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
+    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index e9b37cb3a..818c34f43 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -43,7 +43,6 @@ env:
   MAX_MODEL_LEN: ${{ inputs.max-model-len }}
   RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
   IMAGE: ${{ inputs.image }}
-  RUNNER_LABEL: ${{ inputs.runner }}
 
 jobs:
   benchmark:
@@ -75,6 +74,7 @@ jobs:
       - name: Launch job script
         run: |
           RUNNER_NAME=${{ runner.name }}
+          RUNNER_LABEL=${{ inputs.runner }}
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }}
 
       - name: Process result

From aab2320fc667e9932013e03f26b54facf332db6e Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 12:00:52 -0700
Subject: [PATCH 06/28] revert local runner var

---
 .github/workflows/benchmark-tmpl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 818c34f43..e9b37cb3a 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -43,6 +43,7 @@ env:
   MAX_MODEL_LEN: ${{ inputs.max-model-len }}
   RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
   IMAGE: ${{ inputs.image }}
+  RUNNER_LABEL: ${{ inputs.runner }}
 
 jobs:
   benchmark:
@@ -74,7 +75,6 @@ jobs:
       - name: Launch job script
         run: |
           RUNNER_NAME=${{ runner.name }}
-          RUNNER_LABEL=${{ inputs.runner }}
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }}
 
       - name: Process result

From 0c5ad16c803a5bac2ec1d668add1e5be77a335e6 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 12:54:45 -0700
Subject: [PATCH 07/28] update sqsh file name to include runner label, e.g. b200-trt

---
 runners/launch_b200-nv.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 5acd79743..576b4f660 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${1%%_*}"
 PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A)

From 7487baa8ccb0c80b05199d0fb04c3e04095cc7d9 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 13:06:10 -0700
Subject: [PATCH 08/28] temporarily remove other benchmarks. only keep
 bmk-b200-trt

---
 .github/workflows/70b-tmpl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index d196fc575..03f606de6 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -127,7 +127,7 @@ jobs:
       timeout: ${{ inputs.timeout }}
 
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-b200-trt, bmk-mi300x, bmk-mi325x]
+    needs: [ bmk-b200-trt]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit

From 1233b53434d081a21c60a84e5fac4329558d3bd2 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 15:00:57 -0700
Subject: [PATCH 09/28] refactor scheduler to add trt tag, update ngc image
 address, update summarize.py to reflect backend, fix issue with result
 filename

---
 .github/workflows/70b-tmpl.yml           |  2 +-
 .github/workflows/benchmark-tmpl.yml     | 10 ++++---
 .github/workflows/workflow-scheduler.yml | 34 ++++++++++++++++++++++++
 utils/summarize.py                       | 10 +++++--
 4 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 03f606de6..a5ed09b57 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -89,7 +89,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200-trt
-      image: 'nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc0'
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index e9b37cb3a..ab34d37cf 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -69,7 +69,7 @@ jobs:
 
       - name: Set result filename
         run: |
-          RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
+          RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
           echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
 
       - name: Launch job script
@@ -78,10 +78,12 @@ jobs:
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }}
 
       - name: Process result
-        run: python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} ${{ env.RESULT_FILENAME }}
+        run: |
+          RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
+          python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} $RESULT_FILENAME
 
       - name: Upload result
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ env.RESULT_FILENAME }}
-          path: agg_${{ env.RESULT_FILENAME }}.json
+          name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
+          path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json
diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index c952000d3..c8c188a07 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -23,6 +23,17 @@ jobs:
       osl: 1024
       max-model-len: 2048
       random-range-ratio: 0.8
+
+  _70b-trt-1k1k:
+    needs: cleanup
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b-trt_1k1k'
+      isl: 1024
+      osl: 1024
+      max-model-len: 2048
+      random-range-ratio: 0.8
   
   # dsr1-1k1k:
   #   needs: cleanup
@@ -45,6 +56,17 @@ jobs:
       osl: 1024
       max-model-len: 9216
       random-range-ratio: 0.8
+
+  _70b-trt-8k1k:
+    needs: cleanup
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b-trt_8k1k'
+      isl: 8192
+      osl: 1024
+      max-model-len: 9216
+      random-range-ratio: 0.8
   
   # dsr1-8k1k:
   #   needs: cleanup
@@ -69,6 +91,18 @@ jobs:
       random-range-ratio: 0.8
       timeout: 240
 
+  _70b-trt-1k8k:
+    needs: cleanup
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b-trt_1k8k'
+      isl: 1024
+      osl: 8192
+      max-model-len: 9216
+      random-range-ratio: 0.8
+      timeout: 240
+
   # dsr1-1k8k:
   #   needs: cleanup
   #   uses: ./.github/workflows/dsr1-tmpl.yml
diff --git a/utils/summarize.py b/utils/summarize.py
index 20d9ae127..245c77de5 100644
--- a/utils/summarize.py
+++ b/utils/summarize.py
@@ -12,14 +12,20 @@
 results.sort(key=lambda r: (r['hw'], r['tp'], r['conc']))
 
 summary_header = f'''\
-| Hardware | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
-| :-: | :-: | :-: | :-: | :-: | :-: | :-: |\
+| Hardware | Framework | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
+| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\
 '''
 print(summary_header)
 
 for result in results:
+    # Extract framework from experiment name or runner
+    framework = 'vLLM'  # default
+    if 'trt' in result.get('exp_name', '').lower() or 'trt' in result.get('runner', '').lower():
+        framework = 'TRT-LLM'
+    
     print(
         f"| {result['hw'].upper()} "
+        f"| {framework} "
         f"| {result['tp']} "
         f"| {result['conc']} "
         f"| {(result['median_ttft'] * 1000):.4f} "

From 7800006ac9556da77b57af249000ced89c3619a8 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 15:07:39 -0700
Subject: [PATCH 10/28] refactor trt into separate yml

---
 .github/workflows/70b-trt-tmpl.yml       | 55 ++++++++++++++++++++++++
 .github/workflows/workflow-scheduler.yml |  6 +--
 2 files changed, 58 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/70b-trt-tmpl.yml

diff --git a/.github/workflows/70b-trt-tmpl.yml b/.github/workflows/70b-trt-tmpl.yml
new file mode 100644
index 000000000..c0f5bfe0f
--- /dev/null
+++ b/.github/workflows/70b-trt-tmpl.yml
@@ -0,0 +1,55 @@
+name: LLaMA 70B TRT-LLM Template
+
+on:
+  workflow_call:
+    inputs:
+      exp-name:
+        required: true
+        type: string
+      isl:
+        required: true
+        type: string
+      osl:
+        required: true
+        type: string
+      max-model-len:
+        required: true
+        type: string
+      random-range-ratio:
+        required: true
+        type: string
+      timeout:
+        required: false
+        type: number
+        default: 180
+
+jobs:
+  find-latest-image:
+    runs-on: ubuntu-latest
+    steps:
+      - name: Find the latest Docker image
+        run: echo "Hardcoding image tags for now."
+
+  bmk-b200-trt:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: b200-trt
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      tp-list: '[1, 2, 4, 8]'
+      timeout: ${{ inputs.timeout }}
+
+  collect-results:
+    needs: [bmk-b200-trt]
+    if: ${{ always() && !cancelled() }}
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index c8c188a07..5039b1b73 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -26,7 +26,7 @@ jobs:
 
   _70b-trt-1k1k:
     needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
+    uses: ./.github/workflows/70b-trt-tmpl.yml
     secrets: inherit
     with:
       exp-name: '70b-trt_1k1k'
@@ -59,7 +59,7 @@ jobs:
 
   _70b-trt-8k1k:
     needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
+    uses: ./.github/workflows/70b-trt-tmpl.yml
     secrets: inherit
     with:
       exp-name: '70b-trt_8k1k'
@@ -93,7 +93,7 @@ jobs:
 
   _70b-trt-1k8k:
     needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
+    uses: ./.github/workflows/70b-trt-tmpl.yml
     secrets: inherit
     with:
       exp-name: '70b-trt_1k8k'

From 43057dde569ed5f14f591be8375d14e7d6c0d23e Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 15:13:21 -0700
Subject: [PATCH 11/28] fix file name

---
 .github/workflows/70b-tmpl.yml                 | 18 +-----------------
 ...-trt_slurm.sh => 70b-trt_b200-trt_slurm.sh} |  0
 2 files changed, 1 insertion(+), 17 deletions(-)
 rename benchmarks/{70b_b200-trt_slurm.sh => 70b-trt_b200-trt_slurm.sh} (100%)

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index a5ed09b57..23ad88551 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -78,22 +78,6 @@ jobs:
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
 
-  bmk-b200-trt:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: b200-trt
-      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
-      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
-
   bmk-mi300x:
     needs: find-latest-image
     uses: ./.github/workflows/benchmark-tmpl.yml
@@ -127,7 +111,7 @@ jobs:
       timeout: ${{ inputs.timeout }}
 
   collect-results:
-    needs: [ bmk-b200-trt]
+    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/benchmarks/70b_b200-trt_slurm.sh b/benchmarks/70b-trt_b200-trt_slurm.sh
similarity index 100%
rename from benchmarks/70b_b200-trt_slurm.sh
rename to benchmarks/70b-trt_b200-trt_slurm.sh

From a94fbd0e8f7b821f0b53fc2b4f33821ecd39101b Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 15:15:22 -0700
Subject: [PATCH 12/28] comment vllm for now

---
 .github/workflows/workflow-scheduler.yml | 62 ++++++++++++------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index 5039b1b73..7a631376c 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -13,16 +13,16 @@ jobs:
   cleanup:
     uses: ./.github/workflows/cluster-cleanup.yml
 
-  _70b-1k1k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k1k'
-      isl: 1024
-      osl: 1024
-      max-model-len: 2048
-      random-range-ratio: 0.8
+  # _70b-1k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k1k'
+  #     isl: 1024
+  #     osl: 1024
+  #     max-model-len: 2048
+  #     random-range-ratio: 0.8
 
   _70b-trt-1k1k:
     needs: cleanup
@@ -46,16 +46,16 @@ jobs:
   #     max-model-len: 2048
   #     random-range-ratio: 0.8
 
-  _70b-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # _70b-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
 
   _70b-trt-8k1k:
     needs: cleanup
@@ -79,17 +79,17 @@ jobs:
   #     max-model-len: 9216
   #     random-range-ratio: 0.8
 
-  _70b-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      timeout: 240
+  # _70b-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     timeout: 240
 
   _70b-trt-1k8k:
     needs: cleanup

From 0225b1026649b213d2ef2cd31f513ae4f206e3d7 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 15:25:33 -0700
Subject: [PATCH 13/28] update port in trtllm-serve

---
 benchmarks/70b-trt_b200-trt_slurm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/70b-trt_b200-trt_slurm.sh b/benchmarks/70b-trt_b200-trt_slurm.sh
index e2461aac6..5f91bb2e2 100644
--- a/benchmarks/70b-trt_b200-trt_slurm.sh
+++ b/benchmarks/70b-trt_b200-trt_slurm.sh
@@ -33,7 +33,7 @@ kv_cache_config:
 stream_interval: 4
 EOF
 
-mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml > $SERVER_LOG 2>&1 &
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 &
 
 set +x
 while IFS= read -r line; do

From 1e594f30fcd372124531ccd020da02cc3bbab17d Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 16:07:11 -0700
Subject: [PATCH 14/28] update artifact name to have runner name at end

---
 .github/workflows/benchmark-tmpl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index ab34d37cf..78bd69c10 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -85,5 +85,5 @@ jobs:
       - name: Upload result
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
+          name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
           path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json

From f63768ca0f6959f7e9add440892714ad90911d15 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 20:14:51 -0700
Subject: [PATCH 15/28] update plot function with b200-trt

---
 utils/plot_perf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/plot_perf.py b/utils/plot_perf.py
index 35eb46eb2..7dd12dc59 100644
--- a/utils/plot_perf.py
+++ b/utils/plot_perf.py
@@ -10,6 +10,7 @@
     'h100': 'lightgreen',
     'h200': 'green',
     'b200': 'black',
+    'b200-trt': 'darkblue',
     'mi300x': 'pink',
     'mi325x': 'red',
     'mi355x': 'purple'

From ed20d230dd3d8e3e8787ce674ca3c028bc213c5c Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 20:26:37 -0700
Subject: [PATCH 16/28] add h200 trt

---
 .github/workflows/70b-trt-tmpl.yml | 18 ++++++++-
 benchmarks/70b-h200-trt_slurm.sh   | 63 ++++++++++++++++++++++++++++++
 utils/plot_perf.py                 |  1 +
 3 files changed, 81 insertions(+), 1 deletion(-)
 create mode 100644 benchmarks/70b-h200-trt_slurm.sh

diff --git a/.github/workflows/70b-trt-tmpl.yml b/.github/workflows/70b-trt-tmpl.yml
index c0f5bfe0f..8c7184443 100644
--- a/.github/workflows/70b-trt-tmpl.yml
+++ b/.github/workflows/70b-trt-tmpl.yml
@@ -46,8 +46,24 @@ jobs:
       tp-list: '[1, 2, 4, 8]'
       timeout: ${{ inputs.timeout }}
 
+  bmk-h200-trt:
+    needs: find-latest-image
+    uses: ./.github/workflows/benchmark-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: ${{ inputs.exp-name }}
+      isl: ${{ inputs.isl }}
+      osl: ${{ inputs.osl }}
+      max-model-len: ${{ inputs.max-model-len }}
+      random-range-ratio: ${{ inputs.random-range-ratio }}
+      runner: h200-trt
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      tp-list: '[2, 4, 8]'
+      timeout: ${{ inputs.timeout }}
+
   collect-results:
-    needs: [bmk-b200-trt]
+    needs: [bmk-b200-trt, bmk-h200-trt]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/benchmarks/70b-h200-trt_slurm.sh b/benchmarks/70b-h200-trt_slurm.sh
new file mode 100644
index 000000000..5f91bb2e2
--- /dev/null
+++ b/benchmarks/70b-h200-trt_slurm.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Starts trtllm-serve for $MODEL, waits for startup, then runs the serving benchmark.
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+set -x
+hf download "$MODEL"
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+
+# Write the options file handed to trtllm-serve via --extra_llm_api_options
+cat > llama-config.yml << 'EOF'
+enable_attention_dp: false
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 1024
+kv_cache_config:
+  dtype: fp8
+  enable_block_reuse: false
+stream_interval: 4
+EOF
+
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens "$MAX_MODEL_LEN" --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port "$PORT" > "$SERVER_LOG" 2>&1 &
+
+set +x  # stop command tracing while the server log is echoed line by line
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then  # any-case "error" aborts the job
+        sleep 5
+        tail -n100 "$SERVER_LOG"
+        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
+        exit 1
+    fi
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        break
+    fi
+done < <(tail -F -n0 "$SERVER_LOG")
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend openai \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts $(( $CONC * 10 )) --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
diff --git a/utils/plot_perf.py b/utils/plot_perf.py
index 7dd12dc59..5b2909fe3 100644
--- a/utils/plot_perf.py
+++ b/utils/plot_perf.py
@@ -9,6 +9,7 @@
 hw_color = {
     'h100': 'lightgreen',
     'h200': 'green',
+    'h200-trt': 'darkgreen',
     'b200': 'black',
     'b200-trt': 'darkblue',
     'mi300x': 'pink',

From 25566a91ab2054d175e062f9b0529a416675a07c Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 20:32:19 -0700
Subject: [PATCH 17/28] fix launch slurm script based on runner label

---
 runners/launch_h200-nv.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh
index 4bedf9b71..e53d952d0 100644
--- a/runners/launch_h200-nv.sh
+++ b/runners/launch_h200-nv.sh
@@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="dgx-h200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_h200.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A)
@@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \
 --container-mount-home \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_h200_slurm.sh
+bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh
 
 scancel $JOB_ID

From d33cda5d7c7095d07c8d396b57873ca709b2d577 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 20:46:15 -0700
Subject: [PATCH 18/28] better identify if result is vllm or trt

---
 utils/summarize.py | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/utils/summarize.py b/utils/summarize.py
index 245c77de5..6c6f9dc43 100644
--- a/utils/summarize.py
+++ b/utils/summarize.py
@@ -18,10 +18,19 @@
 print(summary_header)
 
 for result in results:
-    # Extract framework from experiment name or runner
-    framework = 'vLLM'  # default
-    if 'trt' in result.get('exp_name', '').lower() or 'trt' in result.get('runner', '').lower():
-        framework = 'TRT-LLM'
+    # Prefer the explicit 'framework' field; an explicit value (including
+    # 'vLLM') is authoritative and is never overridden by name heuristics.
+    framework = result.get('framework')
+
+    # Legacy results carry no framework field: infer it from exp_name/runner.
+    if not framework:
+        exp_name = result.get('exp_name', '').lower()
+        runner = result.get('runner', '').lower()
+
+        # 'trt' also matches 'trt-llm'; 'tensorrt' needs its own check.
+        trt_markers = ('trt', 'tensorrt')
+        is_trt = any(m in exp_name or m in runner for m in trt_markers)
+        framework = 'TRT-LLM' if is_trt else 'vLLM'
     
     print(
         f"| {result['hw'].upper()} "

From de2d8de25e2291400da52706583d8941b7d57558 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 21:26:01 -0700
Subject: [PATCH 19/28] clarify runners for trt and vllm

---
 .github/workflows/70b-tmpl.yml           |  4 +-
 .github/workflows/70b-trt-tmpl.yml       |  4 +-
 .github/workflows/cluster-cleanup.yml    |  5 ++
 .github/workflows/workflow-scheduler.yml | 62 ++++++++++++------------
 4 files changed, 40 insertions(+), 35 deletions(-)

diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 23ad88551..ff70adcca 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -59,7 +59,7 @@ jobs:
       runner: h200
       image: 'kedarpotdar147/vllm0.1:latest'
       model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
+      tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
 
   bmk-b200:
@@ -75,7 +75,7 @@ jobs:
       runner: b200
       image: 'kedarpotdar147/vllm0.1:latest'
       model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
+      tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
 
   bmk-mi300x:
diff --git a/.github/workflows/70b-trt-tmpl.yml b/.github/workflows/70b-trt-tmpl.yml
index 8c7184443..61e5c77d9 100644
--- a/.github/workflows/70b-trt-tmpl.yml
+++ b/.github/workflows/70b-trt-tmpl.yml
@@ -43,7 +43,7 @@ jobs:
       runner: b200-trt
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
+      tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
 
   bmk-h200-trt:
@@ -59,7 +59,7 @@ jobs:
       runner: h200-trt
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
-      tp-list: '[2, 4, 8]'
+      tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
 
   collect-results:
diff --git a/.github/workflows/cluster-cleanup.yml b/.github/workflows/cluster-cleanup.yml
index a74311d9f..3ecf9763d 100644
--- a/.github/workflows/cluster-cleanup.yml
+++ b/.github/workflows/cluster-cleanup.yml
@@ -23,8 +23,13 @@ jobs:
           - 'h200-nv_1'
           - 'h200-nv_2'
           - 'h200-nv_3'
+          - 'h200-trt_0'
+          - 'h200-trt_1'
+          - 'h200-trt_2'
           - 'b200-nv_0'
           - 'b200-nv_1'
+          - 'b200-trt_0'
+          - 'b200-trt_1'
           - 'mi325x-tw_0'
           - 'mi325x-tw_1'
           - 'mi325x-tw_2'
diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index 7a631376c..2acf7f741 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -13,16 +13,16 @@ jobs:
   cleanup:
     uses: ./.github/workflows/cluster-cleanup.yml
 
-  # _70b-1k1k:
-  #   needs: cleanup
-  #   uses: ./.github/workflows/70b-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: '70b_1k1k'
-  #     isl: 1024
-  #     osl: 1024
-  #     max-model-len: 2048
-  #     random-range-ratio: 0.8
+  _70b-1k1k:
+    needs: cleanup
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k1k'
+      isl: 1024
+      osl: 1024
+      max-model-len: 2048
+      random-range-ratio: 0.8
 
   _70b-trt-1k1k:
     needs: cleanup
@@ -57,16 +57,16 @@ jobs:
   #     max-model-len: 9216
   #     random-range-ratio: 0.8
 
-  _70b-trt-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-trt-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b-trt_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # _70b-trt-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-trt-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b-trt_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
   
   # dsr1-8k1k:
   #   needs: cleanup
@@ -91,17 +91,17 @@ jobs:
   #     random-range-ratio: 0.8
   #     timeout: 240
 
-  _70b-trt-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-trt-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b-trt_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      timeout: 240
+  # _70b-trt-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-trt-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b-trt_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     timeout: 240
 
   # dsr1-1k8k:
   #   needs: cleanup

From 80dc11defa2acc3633431ed38043fe470250ab28 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 22:01:27 -0700
Subject: [PATCH 20/28] fix runner names

---
 .github/workflows/70b-trt-tmpl.yml                            | 4 ++--
 .../{70b-h200-trt_slurm.sh => 70b-trt_b200-nv_slurm.sh}       | 0
 .../{70b-trt_b200-trt_slurm.sh => 70b-trt_h200-nv_slurm.sh}   | 0
 3 files changed, 2 insertions(+), 2 deletions(-)
 rename benchmarks/{70b-h200-trt_slurm.sh => 70b-trt_b200-nv_slurm.sh} (100%)
 rename benchmarks/{70b-trt_b200-trt_slurm.sh => 70b-trt_h200-nv_slurm.sh} (100%)

diff --git a/.github/workflows/70b-trt-tmpl.yml b/.github/workflows/70b-trt-tmpl.yml
index 61e5c77d9..36b06e513 100644
--- a/.github/workflows/70b-trt-tmpl.yml
+++ b/.github/workflows/70b-trt-tmpl.yml
@@ -40,7 +40,7 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: b200-trt
+      runner: b200-nv
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[2]'
@@ -56,7 +56,7 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: h200-trt
+      runner: h200-nv
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[2]'
diff --git a/benchmarks/70b-h200-trt_slurm.sh b/benchmarks/70b-trt_b200-nv_slurm.sh
similarity index 100%
rename from benchmarks/70b-h200-trt_slurm.sh
rename to benchmarks/70b-trt_b200-nv_slurm.sh
diff --git a/benchmarks/70b-trt_b200-trt_slurm.sh b/benchmarks/70b-trt_h200-nv_slurm.sh
similarity index 100%
rename from benchmarks/70b-trt_b200-trt_slurm.sh
rename to benchmarks/70b-trt_h200-nv_slurm.sh

From 3cf357bfac4ec438eb52ea63726b29a53eed1d66 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 22:06:45 -0700
Subject: [PATCH 21/28] remove trt runners

---
 .github/workflows/cluster-cleanup.yml | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/.github/workflows/cluster-cleanup.yml b/.github/workflows/cluster-cleanup.yml
index 3ecf9763d..a74311d9f 100644
--- a/.github/workflows/cluster-cleanup.yml
+++ b/.github/workflows/cluster-cleanup.yml
@@ -23,13 +23,8 @@ jobs:
           - 'h200-nv_1'
           - 'h200-nv_2'
           - 'h200-nv_3'
-          - 'h200-trt_0'
-          - 'h200-trt_1'
-          - 'h200-trt_2'
           - 'b200-nv_0'
           - 'b200-nv_1'
-          - 'b200-trt_0'
-          - 'b200-trt_1'
           - 'mi325x-tw_0'
           - 'mi325x-tw_1'
           - 'mi325x-tw_2'

From 9d7cbd3c3c4a89ba2ecea25b52f9396371556da8 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 23:00:58 -0700
Subject: [PATCH 22/28] ensure trt runners are correctly tagged

---
 .github/workflows/70b-trt-tmpl.yml       |  4 ++--
 .github/workflows/workflow-scheduler.yml | 20 ++++++++++----------
 2 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/workflows/70b-trt-tmpl.yml b/.github/workflows/70b-trt-tmpl.yml
index 36b06e513..b66fdf2b7 100644
--- a/.github/workflows/70b-trt-tmpl.yml
+++ b/.github/workflows/70b-trt-tmpl.yml
@@ -40,7 +40,7 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: b200-nv
+      runner: b200
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[2]'
@@ -56,7 +56,7 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: h200-nv
+      runner: h200
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[2]'
diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index 2acf7f741..c65335569 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -13,16 +13,16 @@ jobs:
   cleanup:
     uses: ./.github/workflows/cluster-cleanup.yml
 
-  _70b-1k1k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k1k'
-      isl: 1024
-      osl: 1024
-      max-model-len: 2048
-      random-range-ratio: 0.8
+  # _70b-1k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k1k'
+  #     isl: 1024
+  #     osl: 1024
+  #     max-model-len: 2048
+  #     random-range-ratio: 0.8
 
   _70b-trt-1k1k:
     needs: cleanup

From a2ed19c37d8a06ad0196339305ac644e3bd4fbf6 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 23:04:03 -0700
Subject: [PATCH 23/28] rename launch scripts

---
 benchmarks/{70b-trt_b200-nv_slurm.sh => 70b-trt_b200_slurm.sh} | 0
 benchmarks/{70b-trt_h200-nv_slurm.sh => 70b-trt_h200_slurm.sh} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename benchmarks/{70b-trt_b200-nv_slurm.sh => 70b-trt_b200_slurm.sh} (100%)
 rename benchmarks/{70b-trt_h200-nv_slurm.sh => 70b-trt_h200_slurm.sh} (100%)

diff --git a/benchmarks/70b-trt_b200-nv_slurm.sh b/benchmarks/70b-trt_b200_slurm.sh
similarity index 100%
rename from benchmarks/70b-trt_b200-nv_slurm.sh
rename to benchmarks/70b-trt_b200_slurm.sh
diff --git a/benchmarks/70b-trt_h200-nv_slurm.sh b/benchmarks/70b-trt_h200_slurm.sh
similarity index 100%
rename from benchmarks/70b-trt_h200-nv_slurm.sh
rename to benchmarks/70b-trt_h200_slurm.sh

From fd1ff2e45bd850fa0f3f28def31f82d6583e3715 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 23:35:18 -0700
Subject: [PATCH 24/28] only take the last job id listed by squeue

---
 runners/launch_b200-nv.sh | 2 +-
 runners/launch_h200-nv.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 576b4f660..54a9d97a9 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -8,7 +8,7 @@ PARTITION="dgx-b200"
 SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
-JOB_ID=$(squeue -u $USER -h -o %A)
+JOB_ID=$(squeue -u $USER -h -o %A | tail -1)
 
 set -x
 srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh
index e53d952d0..52256221a 100644
--- a/runners/launch_h200-nv.sh
+++ b/runners/launch_h200-nv.sh
@@ -8,7 +8,7 @@ PARTITION="dgx-h200"
 SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
-JOB_ID=$(squeue -u $USER -h -o %A)
+JOB_ID=$(squeue -u $USER -h -o %A | tail -1)
 
 set -x
 srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"

From 63d11bf5d6515900b9cf0a6c34caac7406e9b771 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 23:51:52 -0700
Subject: [PATCH 25/28] update trtllm image version

---
 .github/workflows/70b-trt-tmpl.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/70b-trt-tmpl.yml b/.github/workflows/70b-trt-tmpl.yml
index b66fdf2b7..40cf27f38 100644
--- a/.github/workflows/70b-trt-tmpl.yml
+++ b/.github/workflows/70b-trt-tmpl.yml
@@ -41,7 +41,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: b200
-      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
@@ -57,7 +57,7 @@ jobs:
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
       runner: h200
-      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc0'
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
       model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
       tp-list: '[2]'
       timeout: ${{ inputs.timeout }}

From 85a6e51b95032f226074756c9b41bdee8de423f4 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Thu, 28 Aug 2025 23:55:34 -0700
Subject: [PATCH 26/28] add -2 suffix to squash image file names

---
 runners/launch_b200-nv.sh | 2 +-
 runners/launch_h100-cw.sh | 2 +-
 runners/launch_h200-cw.sh | 4 ++--
 runners/launch_h200-nb.sh | 2 +-
 runners/launch_h200-nv.sh | 2 +-
 5 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 54a9d97a9..21ec5c35e 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -5,7 +5,7 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${1%%_*}"
 PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A | tail -1)
diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh
index 570790e0b..f39c2f8b0 100644
--- a/runners/launch_h100-cw.sh
+++ b/runners/launch_h100-cw.sh
@@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="h100"
-SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100.sqsh"
+SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A)
diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh
index 3245cb379..1329fd4f7 100644
--- a/runners/launch_h200-cw.sh
+++ b/runners/launch_h200-cw.sh
@@ -5,10 +5,10 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="h200"
-SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200.sqsh"
+SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
-JOB_ID=$(squeue -u $USER -h -o %A)
+JOB_ID=$(squeue -u $USER -h -o %A | tail -1)
 
 set -x
 srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 028cf8033..7d4dbd2df 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/home/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="main"
-SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200.sqsh"
+SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A)
diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh
index 52256221a..b5b2d7df5 100644
--- a/runners/launch_h200-nv.sh
+++ b/runners/launch_h200-nv.sh
@@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="dgx-h200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A | tail -1)

From 6c8af514fefb8aef6afc9349ca885f2f3f62af16 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Fri, 29 Aug 2025 00:16:53 -0700
Subject: [PATCH 27/28] add fw identifier to benchmark template

---
 .github/workflows/benchmark-tmpl.yml | 12 +++++++++++-
 utils/process_result.py              | 16 ++++++++++++++--
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 78bd69c10..4bb8213c3 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -80,7 +80,17 @@ jobs:
       - name: Process result
         run: |
           RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
-          python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} $RESULT_FILENAME
+          # Map the container image name to a framework label.
+          # Patterns are tried in order; the first match wins.
+          IMAGE_REF="${{ inputs.image }}"
+          case "$IMAGE_REF" in
+            *tensorrt-llm*) FRAMEWORK="TRT-LLM" ;;
+            *vllm*) FRAMEWORK="vLLM" ;;
+            *sglang*) FRAMEWORK="SGLang" ;;
+            # Unknown image: fall back to the runner name as the label.
+            *) FRAMEWORK="${{ inputs.runner }}" ;;
+          esac
+          python3 utils/process_result.py "$FRAMEWORK" ${{ env.TP }} "$RESULT_FILENAME"
 
       - name: Upload result
         uses: actions/upload-artifact@v4
diff --git a/utils/process_result.py b/utils/process_result.py
index d0f0ef000..e7b697361 100644
--- a/utils/process_result.py
+++ b/utils/process_result.py
@@ -3,15 +3,27 @@
 from pathlib import Path
 
 
-hw = sys.argv[1]
+framework = sys.argv[1]  # First arg is the framework (TRT-LLM, vLLM, SGLang, etc.)
 tp_size = int(sys.argv[2])
 result_filename = sys.argv[3]
 
 with open(f'{result_filename}.json') as f:
     bmk_result = json.load(f)
 
+# Recover the hardware type from the runner suffix of the result filename.
+# Result filename format: {exp-name}_tp{tp}_conc{conc}_{runner}
+# exp-name and runner may themselves contain '_', so anchor on '_conc'.
+_exp_tp, conc_sep, conc_and_runner = result_filename.rpartition('_conc')
+runner_part = conc_and_runner.partition('_')[2] if conc_sep else ''
+if runner_part:
+    # Hardware type is the runner prefix (e.g., 'b200' from 'b200-nv_0')
+    hw = runner_part.split('-')[0].upper()  # uppercase for consistency
+else:
+    hw = "UNKNOWN"
+
 data = {
-    'hw': hw,
+    'hw': hw,           # Hardware (B200, H200, etc.)
+    'framework': framework,  # Framework (TRT-LLM, vLLM, SGLang, etc.)
     'tp': tp_size,
     'conc': int(bmk_result['max_concurrency']),
     'model': bmk_result['model_id'],

From 9946fb83526f734b1e89a8b571e264bf1c899331 Mon Sep 17 00:00:00 2001
From: Kedar Potdar <kepotdar@nvidia.com>
Date: Fri, 29 Aug 2025 00:17:48 -0700
Subject: [PATCH 28/28] limit concurrency for now

---
 .github/workflows/benchmark-tmpl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 4bb8213c3..c1e89d80a 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -54,7 +54,7 @@ jobs:
       fail-fast: false
       matrix:
         tp: ${{ fromJson(inputs.tp-list) }}
-        conc: [4, 8, 16, 32, 64]
+        conc: [4]
     name: '${{ inputs.runner }} (tp${{ matrix.tp }} , conc${{ matrix.conc }})'
 
     env: