Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
d556b88
add trt init for 70b
kedarpotdar-nv Aug 28, 2025
426f48e
remove dsr1 and add $MAX_MODEL_LEN to launch configs
kedarpotdar-nv Aug 28, 2025
12a7f6e
remove b200 tg
kedarpotdar-nv Aug 28, 2025
0fc8ab4
add RUNNER LABEL and temporarily remove bmk-b200?
kedarpotdar-nv Aug 28, 2025
4b30c03
fix per kimbo's suggestion
kedarpotdar-nv Aug 28, 2025
aab2320
revert local runner var
kedarpotdar-nv Aug 28, 2025
0c5ad16
update sqsh file name to include runner name, e.g. trt
kedarpotdar-nv Aug 28, 2025
7487baa
temporarily remove other benchmarks. only keep bmk-b200-trt
kedarpotdar-nv Aug 28, 2025
1233b53
refactor scheduler to add trt tag, update ngc image address, update …
kedarpotdar-nv Aug 28, 2025
7800006
refactor trt into separate yml
kedarpotdar-nv Aug 28, 2025
43057dd
fix file name
kedarpotdar-nv Aug 28, 2025
a94fbd0
comment vllm for now
kedarpotdar-nv Aug 28, 2025
0225b10
update port in trtllm-serve
kedarpotdar-nv Aug 28, 2025
1e594f3
update artifact name to have runner name at end
kedarpotdar-nv Aug 28, 2025
f63768c
update plot function with b200-trt
kedarpotdar-nv Aug 29, 2025
ed20d23
add h200 trt
kedarpotdar-nv Aug 29, 2025
25566a9
fix launch slurm script based on runner label
kedarpotdar-nv Aug 29, 2025
d33cda5
better identify if result is vllm or trt
kedarpotdar-nv Aug 29, 2025
de2d8de
clarify runners for trt and vllm
kedarpotdar-nv Aug 29, 2025
80dc11d
fix runner names
kedarpotdar-nv Aug 29, 2025
3cf357b
remove trt runners
kedarpotdar-nv Aug 29, 2025
9d7cbd3
ensure trt runners are correctly tagged
kedarpotdar-nv Aug 29, 2025
a2ed19c
rename launch scripts
kedarpotdar-nv Aug 29, 2025
fd1ff2e
only get latest run id
kedarpotdar-nv Aug 29, 2025
63d11bf
update trtllm image version
kedarpotdar-nv Aug 29, 2025
85a6e51
img ids
kedarpotdar-nv Aug 29, 2025
6c8af51
add fw identifier to benchmark template
kedarpotdar-nv Aug 29, 2025
9946fb8
limit concurrency for now
kedarpotdar-nv Aug 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .github/workflows/70b-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ jobs:
runner: h200
image: 'kedarpotdar147/vllm0.1:latest'
model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
tp-list: '[1, 2, 4, 8]'
tp-list: '[2]'
timeout: ${{ inputs.timeout }}

bmk-b200:
Expand All @@ -75,7 +75,7 @@ jobs:
runner: b200
image: 'kedarpotdar147/vllm0.1:latest'
model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
tp-list: '[1, 2, 4, 8]'
tp-list: '[2]'
timeout: ${{ inputs.timeout }}

bmk-mi300x:
Expand Down
71 changes: 71 additions & 0 deletions .github/workflows/70b-trt-tmpl.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Reusable workflow: benchmark Llama-3.3-70B under TRT-LLM on B200 and H200
# runners, then collect results. Invoked via `workflow_call` from the scheduler.
name: LLaMA 70B TRT-LLM Template

on:
  workflow_call:
    inputs:
      exp-name:
        required: true
        type: string
      isl:
        required: true
        type: string
      osl:
        required: true
        type: string
      max-model-len:
        required: true
        type: string
      random-range-ratio:
        required: true
        type: string
      timeout:
        required: false
        type: number
        default: 180

jobs:
  # Placeholder job: image discovery is not automated yet, so the image tags
  # below are pinned by hand.
  find-latest-image:
    runs-on: ubuntu-latest
    steps:
      - name: Find the latest Docker image
        run: echo "Hardcoding image tags for now."

  bmk-b200-trt:
    needs: find-latest-image
    uses: ./.github/workflows/benchmark-tmpl.yml
    secrets: inherit
    with:
      exp-name: ${{ inputs.exp-name }}
      isl: ${{ inputs.isl }}
      osl: ${{ inputs.osl }}
      max-model-len: ${{ inputs.max-model-len }}
      random-range-ratio: ${{ inputs.random-range-ratio }}
      runner: b200
      # enroot-style image URI: '#' separates the registry host from the path.
      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
      tp-list: '[2]'
      timeout: ${{ inputs.timeout }}

  bmk-h200-trt:
    needs: find-latest-image
    uses: ./.github/workflows/benchmark-tmpl.yml
    secrets: inherit
    with:
      exp-name: ${{ inputs.exp-name }}
      isl: ${{ inputs.isl }}
      osl: ${{ inputs.osl }}
      max-model-len: ${{ inputs.max-model-len }}
      random-range-ratio: ${{ inputs.random-range-ratio }}
      runner: h200
      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
      tp-list: '[2]'
      timeout: ${{ inputs.timeout }}

  # Run even when a benchmark job fails, but not when the run was cancelled,
  # so partial results are still gathered.
  collect-results:
    needs: [bmk-b200-trt, bmk-h200-trt]
    if: ${{ always() && !cancelled() }}
    uses: ./.github/workflows/collect-results.yml
    secrets: inherit
    with:
      exp-name: ${{ inputs.exp-name }}
23 changes: 18 additions & 5 deletions .github/workflows/benchmark-tmpl.yml
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ env:
MAX_MODEL_LEN: ${{ inputs.max-model-len }}
RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
IMAGE: ${{ inputs.image }}
RUNNER_LABEL: ${{ inputs.runner }}

jobs:
benchmark:
Expand All @@ -53,7 +54,7 @@ jobs:
fail-fast: false
matrix:
tp: ${{ fromJson(inputs.tp-list) }}
conc: [4, 8, 16, 32, 64]
conc: [4]
name: '${{ inputs.runner }} (tp${{ matrix.tp }} , conc${{ matrix.conc }})'

env:
Expand All @@ -68,7 +69,7 @@ jobs:

- name: Set result filename
run: |
RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV

- name: Launch job script
Expand All @@ -77,10 +78,22 @@ jobs:
bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }}

- name: Process result
run: python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} ${{ env.RESULT_FILENAME }}
run: |
RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
# Determine framework based on image
if [[ "${{ inputs.image }}" == *"tensorrt-llm"* ]]; then
FRAMEWORK="TRT-LLM"
elif [[ "${{ inputs.image }}" == *"vllm"* ]]; then
FRAMEWORK="vLLM"
elif [[ "${{ inputs.image }}" == *"sglang"* ]]; then
FRAMEWORK="SGLang"
else
FRAMEWORK="${{ inputs.runner }}"
fi
python3 utils/process_result.py $FRAMEWORK ${{ env.TP }} $RESULT_FILENAME

- name: Upload result
uses: actions/upload-artifact@v4
with:
name: ${{ env.RESULT_FILENAME }}
path: agg_${{ env.RESULT_FILENAME }}.json
name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json
2 changes: 1 addition & 1 deletion .github/workflows/cluster-cleanup.yml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
runner:
- 'h100-cr_0'
- 'h100-cr_1'
- 'b200-tg_0'
# - 'b200-tg_0'
- 'mi300x-cr_0'
- 'mi300x-amd_0'
- 'mi300x-amd_1'
Expand Down
142 changes: 88 additions & 54 deletions .github/workflows/workflow-scheduler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,69 +13,103 @@ jobs:
cleanup:
uses: ./.github/workflows/cluster-cleanup.yml

_70b-1k1k:
# _70b-1k1k:
# needs: cleanup
# uses: ./.github/workflows/70b-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b_1k1k'
# isl: 1024
# osl: 1024
# max-model-len: 2048
# random-range-ratio: 0.8

_70b-trt-1k1k:
needs: cleanup
uses: ./.github/workflows/70b-tmpl.yml
uses: ./.github/workflows/70b-trt-tmpl.yml
secrets: inherit
with:
exp-name: '70b_1k1k'
exp-name: '70b-trt_1k1k'
isl: 1024
osl: 1024
max-model-len: 2048
random-range-ratio: 0.8

dsr1-1k1k:
needs: cleanup
uses: ./.github/workflows/dsr1-tmpl.yml
secrets: inherit
with:
exp-name: 'dsr1_1k1k'
isl: 1024
osl: 1024
max-model-len: 2048
random-range-ratio: 0.8
# dsr1-1k1k:
# needs: cleanup
# uses: ./.github/workflows/dsr1-tmpl.yml
# secrets: inherit
# with:
# exp-name: 'dsr1_1k1k'
# isl: 1024
# osl: 1024
# max-model-len: 2048
# random-range-ratio: 0.8

_70b-8k1k:
needs: cleanup
uses: ./.github/workflows/70b-tmpl.yml
secrets: inherit
with:
exp-name: '70b_8k1k'
isl: 8192
osl: 1024
max-model-len: 9216
random-range-ratio: 0.8
# _70b-8k1k:
# needs: cleanup
# uses: ./.github/workflows/70b-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b_8k1k'
# isl: 8192
# osl: 1024
# max-model-len: 9216
# random-range-ratio: 0.8

# _70b-trt-8k1k:
# needs: cleanup
# uses: ./.github/workflows/70b-trt-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b-trt_8k1k'
# isl: 8192
# osl: 1024
# max-model-len: 9216
# random-range-ratio: 0.8

dsr1-8k1k:
needs: cleanup
uses: ./.github/workflows/dsr1-tmpl.yml
secrets: inherit
with:
exp-name: 'dsr1_8k1k'
isl: 8192
osl: 1024
max-model-len: 9216
random-range-ratio: 0.8
# dsr1-8k1k:
# needs: cleanup
# uses: ./.github/workflows/dsr1-tmpl.yml
# secrets: inherit
# with:
# exp-name: 'dsr1_8k1k'
# isl: 8192
# osl: 1024
# max-model-len: 9216
# random-range-ratio: 0.8

_70b-1k8k:
needs: cleanup
uses: ./.github/workflows/70b-tmpl.yml
secrets: inherit
with:
exp-name: '70b_1k8k'
isl: 1024
osl: 8192
max-model-len: 9216
random-range-ratio: 0.8
timeout: 240
# _70b-1k8k:
# needs: cleanup
# uses: ./.github/workflows/70b-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b_1k8k'
# isl: 1024
# osl: 8192
# max-model-len: 9216
# random-range-ratio: 0.8
# timeout: 240

dsr1-1k8k:
needs: cleanup
uses: ./.github/workflows/dsr1-tmpl.yml
secrets: inherit
with:
exp-name: 'dsr1_1k8k'
isl: 1024
osl: 8192
max-model-len: 9216
random-range-ratio: 0.8
# _70b-trt-1k8k:
# needs: cleanup
# uses: ./.github/workflows/70b-trt-tmpl.yml
# secrets: inherit
# with:
# exp-name: '70b-trt_1k8k'
# isl: 1024
# osl: 8192
# max-model-len: 9216
# random-range-ratio: 0.8
# timeout: 240

# dsr1-1k8k:
# needs: cleanup
# uses: ./.github/workflows/dsr1-tmpl.yml
# secrets: inherit
# with:
# exp-name: 'dsr1_1k8k'
# isl: 1024
# osl: 8192
# max-model-len: 9216
# random-range-ratio: 0.8
63 changes: 63 additions & 0 deletions benchmarks/70b-trt_b200_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Launch a TRT-LLM server for the given model on a B200 SLURM node, wait for
# startup, then run the bench_serving benchmark against it and save results.

# === Required Env Vars ===
# HF_TOKEN           - Hugging Face auth token (consumed by `hf download`)
# HF_HUB_CACHE       - Hugging Face model cache directory
# IMAGE              - container image (used by the surrounding launcher)
# MODEL              - HF model id served by trtllm-serve
# ISL / OSL          - input / output sequence lengths for random prompts
# MAX_MODEL_LEN      - max sequence length (also passed as max_num_tokens)
# RANDOM_RANGE_RATIO - length-variation ratio for the random dataset
# TP                 - tensor-parallel size
# CONC               - max benchmark concurrency (num prompts = CONC * 10)
# RESULT_FILENAME    - basename (without extension) for the result JSON
# PORT_OFFSET        - offset added to base port 8888 to avoid collisions

echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"

set -x
hf download "$MODEL"
SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
PORT=$(( 8888 + PORT_OFFSET ))

# Extra LLM API options for trtllm-serve. Quoted heredoc delimiter: the body
# is written verbatim, with no variable expansion.
cat > llama-config.yml << 'EOF'
enable_attention_dp: false
cuda_graph_config:
  enable_padding: true
  max_batch_size: 1024
kv_cache_config:
  dtype: fp8
  enable_block_reuse: false
stream_interval: 4
EOF

mpirun -n 1 --oversubscribe --allow-run-as-root \
  trtllm-serve "$MODEL" --tp_size "$TP" --trust_remote_code \
  --max_seq_len "$MAX_MODEL_LEN" --max_num_tokens "$MAX_MODEL_LEN" \
  --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml \
  --port "$PORT" > "$SERVER_LOG" 2>&1 &

# Readiness watch: stream the server log, fail fast on any line containing
# "error" (case-insensitive), proceed once startup completes.
# NOTE(review): the `tail -F` from the process substitution is left running
# after `break`; it is reaped when the SLURM job ends — confirm acceptable.
# NOTE(review): no timeout — if the server never errors or starts, this
# loop blocks until the job's walltime limit.
set +x
while IFS= read -r line; do
  printf '%s\n' "$line"
  if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
    sleep 5                    # give the server a moment to flush its log
    tail -n100 "$SERVER_LOG"
    echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME"
    exit 1
  fi
  if [[ "$line" == *"Application startup complete"* ]]; then
    break
  fi
done < <(tail -F -n0 "$SERVER_LOG")

set -x
git clone https://github.com/kimbochen/bench_serving.git
python3 bench_serving/benchmark_serving.py \
  --model "$MODEL" --backend openai \
  --base-url "http://0.0.0.0:$PORT" \
  --dataset-name random \
  --random-input-len "$ISL" --random-output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts $(( CONC * 10 )) --max-concurrency "$CONC" \
  --request-rate inf --ignore-eos \
  --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \
  --result-dir /workspace/ \
  --result-filename "$RESULT_FILENAME.json"
Loading