SemiAnalysisAI · kedarpotdar-nv · Aug 28, 2025 · Aug 28, 2025 · Aug 28, 2025 · Aug 28, 2025
diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
@@ -30,37 +30,37 @@ jobs:
       - name: Find the latest Docker image
         run: echo "Hardcoding image tags for now."
 
-  bmk-h100:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: h100
-      image: 'kedarpotdar147/vllm0.1:latest'
-      model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
+  # bmk-h100:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: h100
+  #     image: 'kedarpotdar147/vllm0.1:latest'
+  #     model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
+  #     tp-list: '[2]'
+  #     timeout: ${{ inputs.timeout }}
 
-  bmk-h200:
-    needs: find-latest-image
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: h200
-      image: 'kedarpotdar147/vllm0.1:latest'
-      model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
-      timeout: ${{ inputs.timeout }}
+  # bmk-h200:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: h200
+  #     image: 'kedarpotdar147/vllm0.1:latest'
+  #     model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
+  #     tp-list: '[2]'
+  #     timeout: ${{ inputs.timeout }}
 
   bmk-b200:
     needs: find-latest-image
@@ -75,43 +75,96 @@ jobs:
       runner: b200
       image: 'kedarpotdar147/vllm0.1:latest'
       model: 'nvidia/Llama-3.1-70B-Instruct-FP8'
-      tp-list: '[1, 2, 4, 8]'
+      tp-list: '[2]'
       timeout: ${{ inputs.timeout }}
 
-  bmk-mi300x:
+  # bmk-mi300x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi300x
+  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+  #     model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
+  #     tp-list: '[1, 2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
+
+  # bmk-mi325x:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: mi325x
+  #     image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
+  #     model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
+  #     tp-list: '[1, 2, 4, 8]'
+  #     timeout: ${{ inputs.timeout }}
+
+  # TRT-LLM jobs
+  # bmk-b200-trt:
+  #   needs: find-latest-image
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 70b-trt
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
+  #     runner: b200
+  #     image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
+  #     model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+  #     tp-list: '[2]'
+  #     precision: 'fp8'
+  #     timeout: ${{ inputs.timeout }}
+
+  bmk-h200-trt:
     needs: find-latest-image
     uses: ./.github/workflows/benchmark-tmpl.yml
     secrets: inherit
     with:
-      exp-name: ${{ inputs.exp-name }}
+      exp-name: 70b-trt
       isl: ${{ inputs.isl }}
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi300x
-      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-      model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
-      tp-list: '[1, 2, 4, 8]'
+      runner: h200
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP8'
+      tp-list: '[2]'
+      precision: 'fp8'
       timeout: ${{ inputs.timeout }}
 
-  bmk-mi325x:
+  bmk-b200-trt-fp4:
     needs: find-latest-image
     uses: ./.github/workflows/benchmark-tmpl.yml
     secrets: inherit
     with:
-      exp-name: ${{ inputs.exp-name }}
+      exp-name: 70b-trt
       isl: ${{ inputs.isl }}
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      runner: mi325x
-      image: 'rocm/vllm-dev:nightly_official_0729_rc1_20250718'
-      model: 'amd/Llama-3.1-70B-Instruct-FP8-KV'
-      tp-list: '[1, 2, 4, 8]'
+      runner: b200
+      image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc1'
+      model: 'nvidia/Llama-3.3-70B-Instruct-FP4'
+      tp-list: '[2]'
+      precision: 'fp4'
       timeout: ${{ inputs.timeout }}
 
+
   collect-results:
-    needs: [bmk-h100, bmk-h200, bmk-b200, bmk-mi300x, bmk-mi325x]
+    needs: [bmk-b200,  bmk-h200-trt, bmk-b200-trt-fp4]
     if: ${{ always() && !cancelled() }}
     uses: ./.github/workflows/collect-results.yml
     secrets: inherit

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
@@ -29,6 +29,10 @@ on:
       tp-list:
         required: true
         type: string
+      precision:
+        required: false
+        type: string
+        default: 'fp8'
       timeout:
         required: true
         type: number
@@ -43,6 +47,8 @@ env:
   MAX_MODEL_LEN: ${{ inputs.max-model-len }}
   RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }}
   IMAGE: ${{ inputs.image }}
+  RUNNER_LABEL: ${{ inputs.runner }}
+  PRECISION: ${{ inputs.precision }}
 
 jobs:
   benchmark:
@@ -53,7 +59,7 @@ jobs:
       fail-fast: false
       matrix:
         tp: ${{ fromJson(inputs.tp-list) }}
-        conc: [4, 8, 16, 32, 64]
+        conc: [4]
     name: '${{ inputs.runner }} (tp${{ matrix.tp }} , conc${{ matrix.conc }})'
 
     env:
@@ -68,7 +74,7 @@ jobs:
 
       - name: Set result filename
         run: |
-          RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }}
+          RESULT_FILENAME=${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}
           echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV
 
       - name: Launch job script
@@ -77,10 +83,22 @@ jobs:
           bash ./runners/launch_${RUNNER_NAME%%_*}.sh ${{ inputs.exp-name }}
 
       - name: Process result
-        run: python3 utils/process_result.py ${{ inputs.runner }} ${{ env.TP }} ${{ env.RESULT_FILENAME }}
+        run: |
+          RESULT_FILENAME="${EXP_NAME}_tp${TP}_conc${CONC}_${RUNNER_LABEL}"
+          # Derive the framework label from the image name; values come from env vars, not inline expressions, so they are never parsed as shell code.
+          if [[ "$IMAGE" == *"tensorrt-llm"* ]]; then
+            FRAMEWORK="TRT-LLM"
+          elif [[ "$IMAGE" == *"vllm"* ]]; then
+            FRAMEWORK="vLLM"
+          elif [[ "$IMAGE" == *"sglang"* ]]; then
+            FRAMEWORK="SGLang"
+          else
+            FRAMEWORK="$RUNNER_LABEL"
+          fi
+          python3 utils/process_result.py "$FRAMEWORK" "$TP" "$RESULT_FILENAME" "$PRECISION"
 
       - name: Upload result
         uses: actions/upload-artifact@v4
         with:
-          name: ${{ env.RESULT_FILENAME }}
-          path: agg_${{ env.RESULT_FILENAME }}.json
+          name: ${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ env.PRECISION }}_${{ runner.name }}
+          path: agg_${{ env.EXP_NAME }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ inputs.runner }}.json
diff --git a/.github/workflows/cluster-cleanup.yml b/.github/workflows/cluster-cleanup.yml
@@ -24,7 +24,7 @@ jobs:
           - 'h200-nv_2'
           - 'h200-nv_3'
           - 'b200-nv_0'
-          - 'b200-nv_1'
+          - 'b200-nv_1' 
           - 'mi325x-tw_0'
           - 'mi325x-tw_1'
           - 'mi325x-tw_2'
@@ -47,7 +47,7 @@ jobs:
         runner:
           - 'h100-cr_0'
           - 'h100-cr_1'
-          - 'b200-tg_0'
+          # - 'b200-tg_0'
           - 'mi300x-cr_0'
           - 'mi300x-amd_0'
           - 'mi300x-amd_1'

diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml
@@ -22,7 +22,13 @@ jobs:
         uses: actions/download-artifact@v4
         with:
           path: results/
-          pattern: ${{ inputs.exp-name }}_*
+          pattern: ${{ inputs.exp-name }}*
+
+      - name: Download TRT artifacts
+        uses: actions/download-artifact@v4
+        with:
+          path: results/
+          pattern: 70b-trt*
 
       - name: Print summary
         run: python3 utils/summarize.py results/ ${{ inputs.exp-name }} >> $GITHUB_STEP_SUMMARY

diff --git a/.github/workflows/workflow-scheduler.yml b/.github/workflows/workflow-scheduler.yml
@@ -23,59 +23,61 @@ jobs:
       osl: 1024
       max-model-len: 2048
       random-range-ratio: 0.8
-
-  dsr1-1k1k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k1k'
-      isl: 1024
-      osl: 1024
-      max-model-len: 2048
-      random-range-ratio: 0.8
 
-  _70b-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+
+  # dsr1-1k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k1k'
+  #     isl: 1024
+  #     osl: 1024
+  #     max-model-len: 2048
+  #     random-range-ratio: 0.8
+
+  # _70b-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+
 
-  dsr1-8k1k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-8k1k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
 
-  _70b-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      timeout: 240
+  # _70b-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     timeout: 240
 
-  dsr1-1k8k:
-    needs: cleanup
-    uses: ./.github/workflows/dsr1-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'dsr1_1k8k'
-      isl: 1024
-      osl: 8192
-      max-model-len: 9216
-      random-range-ratio: 0.8
+  # dsr1-1k8k:
+  #   needs: cleanup
+  #   uses: ./.github/workflows/dsr1-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'dsr1_1k8k'
+  #     isl: 1024
+  #     osl: 8192
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8