diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 23ad88551..e889a364b 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -30,37 +30,37 @@ jobs:
       - name: Find the latest Docker image
         run: echo "Hardcoding image tags for now."
 
-  bmk-h100:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: h100
-      image: 'kedarpotdar147/vllm0.1:latest'
-      model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
+  # bmk-h100:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: h100
+  #     image: 'kedarpotdar147/vllm0.1:latest'
+  #     model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
+  #     tp-list: '[2]'
+  #     timeout: ${{ inputs.timeout }}
 
-  bmk-h200:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: h200
-      image: 'kedarpotdar147/vllm0.1:latest'
-      model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
+  # bmk-h200:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: h200
+  #     image: 'kedarpotdar147/vllm0.1:latest'
+  #     model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
+  #     tp-list: '[2]'
+  #     timeout: ${{ inputs.timeout }}
 
   bmk-b200:
     needs: find-latest-image
@@ -75,43 +75,96 @@ jobs:
       runner: b200
       image: 'kedarpotdar147/vllm0.1:latest'
       model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
+      tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
 
-  bmk-mi300x:
+  # bmk-mi300x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi300x
+  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+  #     model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
+  #     tp-list: '[1, 2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
+
+  # bmk-mi325x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi325x
+  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+  #     model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
+  #     tp-list: '[1, 2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
+
+  # TRT-LLM jobs
+  # bmk-b200-trt:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 70b-trt
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: b200
+  #     image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
+  #     model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+  #     tp-list: '[2]'
+  #     precision: 'fp8'
+  #     timeout: ${{ inputs.timeout }}
+
+  bmk-h200-trt:
     needs: find-latest-image
     uses: ./.github/workflows/benchmark-tmpl.yml
     secrets: inherit
     with:
-      exp-name: ${{ inputs.exp-name }}
+      exp-name: 70b-trt
       isl: ${{ inputs.isl }}
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi300x
-      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-      model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
-      tp-list: '[1, 2, 4, 8]'
+      runner: h200
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      tp-list: '[2]'
+      precision: 'fp8'
       timeout: ${{ inputs.timeout }}
 
-  bmk-mi325x:
+  bmk-b200-trt-fp4:
     needs: find-latest-image
     uses: ./.github/workflows/benchmark-tmpl.yml
     secrets: inherit
     with:
-      exp-name: ${{ inputs.exp-name }}
+      exp-name: 70b-trt
       isl: ${{ inputs.isl }}
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi325x
-      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-      model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
-      tp-list: '[1, 2, 4, 8]'
+      runner: b200
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP4'
+      tp-list: '[2]'
+      precision: 'fp4'
       timeout: ${{ inputs.timeout }}
 
+
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x]
+    needs: [bmk-b200,  bmk-h200-trt, bmk-b200-trt-fp4]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 7e4e0b708..98f2543b4 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -29,6 +29,10 @@ on:
       tp-list:
         required: true
         type: string
+      precision:
+        required: false
+        type: string
+        default: 'fp8'
       timeout:
         required: true
         type: number
@@ -43,6 +47,8 @@ env:
   MAX_MODEL_LEN: ${{ inputs.max-model-len }}
   RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
   IMAGE: ${{ inputs.image }}
+  RUNNER_LABEL: ${{ inputs.runner }}
+  PRECISION: ${{ inputs.precision }}
 
 jobs:
   benchmark:
@@ -53,7 +59,7 @@ jobs:
       fail-fast: false
       matrix:
         tp: ${{ fromJson(inputs.tp-list) }}
-        conc: [4, 8, 16, 32, 64]
+        conc: [4]
     name: '${{ inputs.runner }} (tp${{ matrix.tp }} , conc${{ matrix.conc }})'
 
     env:
@@ -68,7 +74,7 @@ jobs:
 
       - name: Set result filename
         run: |
-          RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
+          RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
           echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
 
       - name: Launch job script
@@ -77,10 +83,22 @@ jobs:
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }}
 
       - name: Process result
-        run: python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} ${{ env.RESULT_FILENAME }}
+        run: |
+          # Use the exported env vars (IMAGE, RUNNER_LABEL, PRECISION, ...) rather than
+          # expanding workflow expressions into the script text, and quote every
+          # expansion so unusual values cannot split or inject shell words.
+          RESULT_FILENAME="${EXP_NAME}_tp${TP}_conc${CONC}_${RUNNER_LABEL}"
+          # Infer the serving framework from the image name; fall back to the runner label.
+          case "$IMAGE" in
+            *tensorrt-llm*) FRAMEWORK="TRT-LLM" ;;
+            *vllm*) FRAMEWORK="vLLM" ;;
+            *sglang*) FRAMEWORK="SGLang" ;;
+            *) FRAMEWORK="$RUNNER_LABEL" ;;
+          esac
+          python3 utils/process_result.py "$FRAMEWORK" "$TP" "$RESULT_FILENAME" "$PRECISION"
 
       - name: Upload result
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ env.RESULT_FILENAME }}
-          path: agg_${{ env.RESULT_FILENAME }}.json
+          name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ env.PRECISION }}_${{ runner.name }}
+          path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json
diff --git a/.github/workflows/cluster-cleanup.yml b/.github/workflows/cluster-cleanup.yml
index e0f30ae17..373794a69 100644
--- a/.github/workflows/cluster-cleanup.yml
+++ b/.github/workflows/cluster-cleanup.yml
@@ -24,7 +24,7 @@ jobs:
           - 'h200-nv_2'
           - 'h200-nv_3'
           - 'b200-nv_0'
-          - 'b200-nv_1'
+          - 'b200-nv_1' 
           - 'mi325x-tw_0'
           - 'mi325x-tw_1'
           - 'mi325x-tw_2'
@@ -47,7 +47,7 @@ jobs:
         runner:
           - 'h100-cr_0'
           - 'h100-cr_1'
-          - 'b200-tg_0'
+          # - 'b200-tg_0'
           - 'mi300x-cr_0'
           - 'mi300x-amd_0'
           - 'mi300x-amd_1'
diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml
index 8924facb0..c98715e4d 100644
--- a/.github/workflows/collect-results.yml
+++ b/.github/workflows/collect-results.yml
@@ -22,7 +22,13 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           path: results/
-          pattern: ${{ inputs.exp-name }}_*
+          pattern: ${{ inputs.exp-name }}*
+      
+      - name: Download TRT artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: results/
+          pattern: 70b-trt*
 
       - name: Print summary
         run: python3 utils/summarize.py results/ ${{ inputs.exp-name }} >> $GITHUB_STEP_SUMMARY
diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
index ce03740fc..de673c2a5 100644
--- a/.github/workflows/workflow-scheduler.yml
+++ b/.github/workflows/workflow-scheduler.yml
@@ -23,59 +23,61 @@ jobs:
       osl: 1024
       max-model-len: 2048
       random-range-ratio: 0.8
-  
-  dsr1-1k1k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k1k'
-      isl: 1024
-      osl: 1024
-      max-model-len: 2048
-      random-range-ratio: 0.8
 
-  _70b-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+
+  # dsr1-1k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k1k'
+  #     isl: 1024
+  #     osl: 1024
+  #     max-model-len: 2048
+  #     random-range-ratio: 0.8
+
+  # _70b-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+
   
-  dsr1-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
 
-  _70b-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      timeout: 240
+  # _70b-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     timeout: 240
 
-  dsr1-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
diff --git a/benchmarks/70b-trt_b200_slurm.sh b/benchmarks/70b-trt_b200_slurm.sh
new file mode 100644
index 000000000..5f91bb2e2
--- /dev/null
+++ b/benchmarks/70b-trt_b200_slurm.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Serve $MODEL with trtllm-serve, wait for startup, then run bench_serving against it.
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+set -x
+hf download "$MODEL"
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+
+# Write the extra LLM API options file that is passed to trtllm-serve below
+cat > llama-config.yml << 'EOF'
+enable_attention_dp: false
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 1024
+kv_cache_config:
+  dtype: fp8
+  enable_block_reuse: false
+stream_interval: 4
+EOF
+
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens "$MAX_MODEL_LEN" --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port "$PORT" > "$SERVER_LOG" 2>&1 &
+# Echo the server log until startup completes; any line containing "error" aborts the job
+set +x
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
+        sleep 5
+        tail -n100 "$SERVER_LOG"
+        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
+        exit 1
+    fi
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        break
+    fi
+done < <(tail -F -n0 "$SERVER_LOG")
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend openai \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts "$(( $CONC * 10 ))" --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
diff --git a/benchmarks/70b-trt_h200_slurm.sh b/benchmarks/70b-trt_h200_slurm.sh
new file mode 100644
index 000000000..5f91bb2e2
--- /dev/null
+++ b/benchmarks/70b-trt_h200_slurm.sh
@@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+# Serve $MODEL with trtllm-serve, wait for startup, then run bench_serving against it.
+# === Required Env Vars ===
+# HF_TOKEN
+# HF_HUB_CACHE
+# IMAGE
+# MODEL
+# ISL
+# OSL
+# MAX_MODEL_LEN
+# RANDOM_RANGE_RATIO
+# TP
+# CONC
+# RESULT_FILENAME
+# PORT_OFFSET
+
+echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+
+set -x
+hf download "$MODEL"
+SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
+PORT=$(( 8888 + $PORT_OFFSET ))
+
+# Write the extra LLM API options file that is passed to trtllm-serve below
+cat > llama-config.yml << 'EOF'
+enable_attention_dp: false
+cuda_graph_config:
+  enable_padding: true
+  max_batch_size: 1024
+kv_cache_config:
+  dtype: fp8
+  enable_block_reuse: false
+stream_interval: 4
+EOF
+
+mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens "$MAX_MODEL_LEN" --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port "$PORT" > "$SERVER_LOG" 2>&1 &
+# Echo the server log until startup completes; any line containing "error" aborts the job
+set +x
+while IFS= read -r line; do
+    printf '%s\n' "$line"
+    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
+        sleep 5
+        tail -n100 "$SERVER_LOG"
+        echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
+        exit 1
+    fi
+    if [[ "$line" == *"Application startup complete"* ]]; then
+        break
+    fi
+done < <(tail -F -n0 "$SERVER_LOG")
+
+set -x
+git clone https://github.com/kimbochen/bench_serving.git
+python3 bench_serving/benchmark_serving.py \
+--model "$MODEL" --backend openai \
+--base-url "http://0.0.0.0:$PORT" \
+--dataset-name random \
+--random-input-len "$ISL" --random-output-len "$OSL" --random-range-ratio "$RANDOM_RANGE_RATIO" \
+--num-prompts "$(( $CONC * 10 ))" --max-concurrency "$CONC" \
+--request-rate inf --ignore-eos \
+--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
+--result-dir /workspace/ \
+--result-filename "$RESULT_FILENAME.json"
diff --git a/benchmarks/70b_b200_docker.sh b/benchmarks/70b_b200_docker.sh
index 27e20c770..da933f4cf 100644
--- a/benchmarks/70b_b200_docker.sh
+++ b/benchmarks/70b_b200_docker.sh
@@ -29,6 +29,9 @@ port=8888
 docker network create $network_name
 
 set -x
+
+pip uninstall -y nvidia-nccl-cu12
+pip install nvidia-nccl-cu12==2.26.2.post1
 docker run --rm -d --network $network_name --name $server_name \
 --runtime nvidia --gpus all --ipc host --privileged --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 \
 -v $HF_HOME_DIR/hf_hub_cache/:$HF_HUB_CACHE \
diff --git a/benchmarks/70b_b200_slurm.sh b/benchmarks/70b_b200_slurm.sh
index fd444abab..9a0ac2558 100644
--- a/benchmarks/70b_b200_slurm.sh
+++ b/benchmarks/70b_b200_slurm.sh
@@ -21,6 +21,9 @@ hf download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
+pip uninstall -y nvidia-nccl-cu12
+pip install nvidia-nccl-cu12==2.26.2.post1
+
 export TORCH_CUDA_ARCH_LIST="10.0"
 vllm serve $MODEL --host 0.0.0.0 --port $PORT \
 --trust-remote-code --quantization modelopt --kv-cache-dtype fp8 --gpu-memory-utilization 0.9 \
diff --git a/runners/launch_b200-nv.sh b/runners/launch_b200-nv.sh
index 83f1ec801..21ec5c35e 100644
--- a/runners/launch_b200-nv.sh
+++ b/runners/launch_b200-nv.sh
@@ -5,10 +5,10 @@ export PORT_OFFSET=${USER: -1}
 
 MODEL_CODE="${1%%_*}"
 PARTITION="dgx-b200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_b200.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
-JOB_ID=$(squeue -u $USER -h -o %A)
+JOB_ID=$(squeue -u $USER -h -o %A | sort -n | tail -1)  # newest job, assuming job IDs increase over time
 
 set -x
 srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
@@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \
 --container-mount-home \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_b200_slurm.sh
+bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh
 
 scancel $JOB_ID
diff --git a/runners/launch_h100-cw.sh b/runners/launch_h100-cw.sh
index 570790e0b..f39c2f8b0 100644
--- a/runners/launch_h100-cw.sh
+++ b/runners/launch_h100-cw.sh
@@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="h100"
-SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100.sqsh"
+SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h100-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A)
diff --git a/runners/launch_h200-cw.sh b/runners/launch_h200-cw.sh
index 3245cb379..1329fd4f7 100644
--- a/runners/launch_h200-cw.sh
+++ b/runners/launch_h200-cw.sh
@@ -5,10 +5,10 @@ export HF_HUB_CACHE_MOUNT="/mnt/vast/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="h200"
-SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200.sqsh"
+SQUASH_FILE="/mnt/vast/squash/image_${MODEL_CODE}_h200-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
-JOB_ID=$(squeue -u $USER -h -o %A)
+JOB_ID=$(squeue -u $USER -h -o %A | sort -n | tail -1)  # newest job, assuming job IDs increase over time
 
 set -x
 srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
diff --git a/runners/launch_h200-nb.sh b/runners/launch_h200-nb.sh
index 028cf8033..7d4dbd2df 100644
--- a/runners/launch_h200-nb.sh
+++ b/runners/launch_h200-nb.sh
@@ -5,7 +5,7 @@ export HF_HUB_CACHE_MOUNT="/home/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="main"
-SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200.sqsh"
+SQUASH_FILE="/home/squash/image_${MODEL_CODE}_h200-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
 JOB_ID=$(squeue -u $USER -h -o %A)
diff --git a/runners/launch_h200-nv.sh b/runners/launch_h200-nv.sh
index 4bedf9b71..b5b2d7df5 100644
--- a/runners/launch_h200-nv.sh
+++ b/runners/launch_h200-nv.sh
@@ -5,10 +5,10 @@ export HF_HUB_CACHE_MOUNT="/raid/hf_hub_cache/"
 export PORT_OFFSET=${USER: -1}
 
 PARTITION="dgx-h200"
-SQUASH_FILE="/raid/image_${MODEL_CODE}_h200.sqsh"
+SQUASH_FILE="/raid/image_${MODEL_CODE}_${RUNNER_LABEL}-2.sqsh"
 
 salloc --partition=$PARTITION --gres=gpu:$TP --exclusive --time=180 --no-shell
-JOB_ID=$(squeue -u $USER -h -o %A)
+JOB_ID=$(squeue -u $USER -h -o %A | sort -n | tail -1)  # newest job, assuming job IDs increase over time
 
 set -x
 srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
@@ -18,6 +18,6 @@ srun --jobid=$JOB_ID \
 --container-mount-home \
 --container-workdir=/workspace/ \
 --no-container-entrypoint --export=ALL \
-bash benchmarks/${MODEL_CODE}_h200_slurm.sh
+bash benchmarks/${MODEL_CODE}_${RUNNER_LABEL}_slurm.sh
 
 scancel $JOB_ID
diff --git a/utils/plot_perf.py b/utils/plot_perf.py
index 35eb46eb2..a7811ea0e 100644
--- a/utils/plot_perf.py
+++ b/utils/plot_perf.py
@@ -8,7 +8,7 @@
 exp_name = sys.argv[2]
 hw_color = {
     'h100': 'lightgreen',
-    'h200': 'green',
+    'h200': 'darkgreen',
     'b200': 'black',
     'mi300x': 'pink',
     'mi325x': 'red',
@@ -25,15 +25,25 @@
 def plot_tput_vs_e2el():
     fig, ax = plt.subplots()
 
-    for hw, color in hw_color.items():
-        xs = [result['median_e2el'] for result in results if result['hw'] == hw]
-        ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw]
-        if xs and ys:
-            ax.scatter(xs, ys, label=hw.upper(), color=color)
+    # Group by hardware, framework, and precision
+    for hw in set(result['hw'] for result in results):
+        for framework in set(result.get('framework', 'vLLM') for result in results if result['hw'] == hw):
+            for precision in set(result.get('precision', 'fp8') for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework):
+                xs = [result.get('e2el', result.get('median_e2el', 0)) for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision]
+                ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision]
+                if xs and ys:
+                    # Only add framework label for TRT-LLM, keep vLLM simple
+                    if framework == 'TRT-LLM':
+                        label = f"{hw.upper()}-TRT-{precision.upper()}"
+                    else:
+                        label = f"{hw.upper()}-{precision.upper()}"
+                    color = hw_color.get(hw.lower(), 'blue')
+                    ax.scatter(xs, ys, label=label, color=color, alpha=0.7)
 
     for result in results:
-        x, y = result['median_e2el'], result['tput_per_gpu']
-        ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8)
+        x = result.get('e2el', result.get('median_e2el', 0))
+        y = result['tput_per_gpu']
+        ax.annotate(f"{result['tp']}-{result.get('precision', 'fp8').upper()}", (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8)
 
     ax.set_xlabel('End-to-end Latency (s)')
     ax.set_ylabel('Throughput per GPU (tok/s)')
@@ -47,15 +57,25 @@ def plot_tput_vs_e2el():
 def plot_tput_vs_intvty():
     fig, ax = plt.subplots()
 
-    for hw, color in hw_color.items():
-        xs = [result['median_intvty'] for result in results if result['hw'] == hw]
-        ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw]
-        if xs and ys:
-            ax.scatter(xs, ys, label=hw.upper(), color=color)
+    # Group by hardware, framework, and precision
+    for hw in set(result['hw'] for result in results):
+        for framework in set(result.get('framework', 'vLLM') for result in results if result['hw'] == hw):
+            for precision in set(result.get('precision', 'fp8') for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework):
+                xs = [result.get('intvty', result.get('median_intvty', 0)) for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision]
+                ys = [result['tput_per_gpu'] for result in results if result['hw'] == hw and result.get('framework', 'vLLM') == framework and result.get('precision', 'fp8') == precision]
+                if xs and ys:
+                    # Only add framework label for TRT-LLM, keep vLLM simple
+                    if framework == 'TRT-LLM':
+                        label = f"{hw.upper()}-TRT-{precision.upper()}"
+                    else:
+                        label = f"{hw.upper()}-{precision.upper()}"
+                    color = hw_color.get(hw.lower(), 'blue')
+                    ax.scatter(xs, ys, label=label, color=color, alpha=0.7)
 
     for result in results:
-        x, y = result['median_intvty'], result['tput_per_gpu']
-        ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8)
+        x = result.get('intvty', result.get('median_intvty', 0))
+        y = result['tput_per_gpu']
+        ax.annotate(f"{result['tp']}-{result.get('precision', 'fp8').upper()}", (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8)
 
     ax.set_xlabel('Interactivity (tok/s/user)')
     ax.set_ylabel('Throughput per GPU (tok/s)')
diff --git a/utils/process_result.py b/utils/process_result.py
index d0f0ef000..76f1b8541 100644
--- a/utils/process_result.py
+++ b/utils/process_result.py
@@ -3,15 +3,29 @@
 from pathlib import Path
 
 
-hw = sys.argv[1]
+framework = sys.argv[1]  # First arg is the framework (TRT-LLM, vLLM, SGLang, etc.)
 tp_size = int(sys.argv[2])
 result_filename = sys.argv[3]
+precision = sys.argv[4] if len(sys.argv) > 4 else 'fp8'  # Fourth arg is precision, default to fp8
 
 with open(f'{result_filename}.json') as f:
     bmk_result = json.load(f)
 
+# Extract hardware from result filename or runner name
+# Result filename format: {exp-name}_tp{tp}_conc{conc}_{runner}
+# We need to extract the hardware type from the runner
+result_parts = result_filename.split('_')
+if len(result_parts) >= 4:
+    runner_part = result_parts[-1]  # Last part is the runner
+    # Extract hardware type (e.g., 'b200' from 'b200-nv_0')
+    hw = runner_part.split('-')[0].upper()  # Convert to uppercase for consistency
+else:
+    hw = "UNKNOWN"
+
 data = {
-    'hw': hw,
+    'hw': hw,           # Hardware (B200, H200, etc.)
+    'framework': framework,  # Framework (TRT-LLM, vLLM, SGLang, etc.)
+    'precision': precision,  # Precision (fp8, fp4, etc.)
     'tp': tp_size,
     'conc': int(bmk_result['max_concurrency']),
     'model': bmk_result['model_id'],
diff --git a/utils/summarize.py b/utils/summarize.py
index 20d9ae127..50ea6e07a 100644
--- a/utils/summarize.py
+++ b/utils/summarize.py
@@ -12,18 +12,42 @@
 results.sort(key=lambda r: (r['hw'], r['tp'], r['conc']))
 
 summary_header = f'''\
-| Hardware | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
-| :-: | :-: | :-: | :-: | :-: | :-: | :-: |\
+| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
+| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\
 '''
 print(summary_header)
 
 for result in results:
+    # Extract framework - prefer explicit framework field, fallback to detection
+    framework = result.get('framework', 'vLLM')  # default to vLLM if not specified
+    
+    # If no explicit framework field, try to detect from other fields
+    if framework == 'vLLM':
+        exp_name = result.get('exp_name', '')
+        runner = result.get('runner', '')
+        
+        # Check for TRT-LLM indicators
+        if ('trt' in exp_name.lower() or 'trt' in runner.lower() or 
+            'trt-llm' in exp_name.lower() or 'trt-llm' in runner.lower() or
+            'tensorrt' in exp_name.lower() or 'tensorrt' in runner.lower()):
+            framework = 'TRT-LLM'
+    
+    # Get precision, default to 'fp8' if not present
+    precision = result.get('precision', 'fp8')
+    
+    # Get metrics with fallbacks for missing fields
+    ttft = result.get('ttft', result.get('median_ttft', 0))
+    tpot = result.get('tpot', result.get('median_tpot', 0))
+    e2el = result.get('e2el', result.get('median_e2el', 0))
+    
     print(
         f"| {result['hw'].upper()} "
+        f"| {framework} "
+        f"| {precision.upper()} "
         f"| {result['tp']} "
         f"| {result['conc']} "
-        f"| {(result['median_ttft'] * 1000):.4f} "
-        f"| {(result['median_tpot'] * 1000):.4f} "
-        f"| {result['median_e2el']:.4f} "
+        f"| {(ttft * 1000):.4f} "
+        f"| {(tpot * 1000):.4f} "
+        f"| {e2el:.4f} "
         f"| {result['tput_per_gpu']:.4f} |"
     )