From 1ada0dcbc6e6f9d8dc3ad767215492a42020b1e9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 14:04:19 -0500 Subject: [PATCH 001/149] initial commit based on kimbos edits --- .github/workflows/1k1k-sweep.yml | 37 ++++++++++ .github/workflows/benchmark-tmpl.yml | 92 ++++++++----------------- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 35 +++------- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 25 ++----- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 25 ++----- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 26 ++++--- 6 files changed, 98 insertions(+), 142 deletions(-) create mode 100644 .github/workflows/1k1k-sweep.yml diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml new file mode 100644 index 000000000..e1a103f83 --- /dev/null +++ b/.github/workflows/1k1k-sweep.yml @@ -0,0 +1,37 @@ +name: '1K/1K Sweep' + +on: + workflow_dispatch: + +jobs: + get-1k1k-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-1k1k-configs.outputs.search-space-config }} + steps: + - id: get-1k1k-configs + run: python utils/print_configs_json.py configs.json 1k1k + + benchmark: + needs: get-1k1k-configs + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-1k1k-configs.outputs.search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + tp: ${{ matrix.config.tp }} + conc: ${{ matrix.config.conc }} + +# collect-results: +# needs: benchmark +# steps: \ No newline at end of file diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 313087946..c78dcb602 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -11,10 +11,10 @@ on: model: 
required: true type: string - framework: + precision: required: true type: string - precision: + framework: required: true type: string exp-name: @@ -26,18 +26,24 @@ on: osl: required: true type: string - max-model-len: - required: true - type: string random-range-ratio: - required: true + required: false type: string - tp-list: + default: '0.2' + tp: required: true type: string - conc-list: + ep: + required: false + type: string + default: '' + dp-attn: + required: false + type: boolean + default: false + conc: + required: true type: string - default: '[4, 8, 16, 32, 64]' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} @@ -46,69 +52,32 @@ env: MODEL: ${{ inputs.model }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} - MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} IMAGE: ${{ inputs.image }} FRAMEWORK: ${{ inputs.framework }} PRECISION: ${{ inputs.precision }} + TP: ${{ inputs.tp }} + EP_SIZE: ${{ inputs.ep }} + DP_ATTENTION: ${{ inputs.dp-attn }} + CONC: ${{ inputs.conc }} jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - - strategy: - fail-fast: false - matrix: - tp: ${{ fromJson(inputs.tp-list) }} - conc: ${{ fromJson(inputs.conc-list) }} - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.tp }} conc${{ matrix.conc }}' - - env: - TP: ${{ matrix.tp }} - CONC: ${{ matrix.conc }} + name: '${{ inputs.runner }} ${{ inputs.exp-name }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then - host=$(hostname) - - if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then - echo "[INFO] Running container-by-container cleanup on $host" - - for cid in $(docker ps -aq); do - echo "[INFO] Cleaning container $cid" - - # Try graceful first - docker stop -t 90 "$cid" || true - - # Wait until 
it's really dead - docker wait "$cid" >/dev/null 2>&1 || true - - # Force remove if anything lingers - docker rm -f "$cid" >/dev/null 2>&1 || true - done - - # Give a moment for GPU processes to fully terminate - sleep 2 - - # Verify GPUs are now idle - if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then - echo "[WARN] After stop, GPU still busy:" - nvidia-smi - # Last resort if driver allows and GPUs appear idle otherwise: - # nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true - fi - else - echo "[Docker] Cleaning up resources ..." - docker ps -aq | xargs -r docker rm -f - docker network prune -f - while [ -n "$(docker ps -aq)" ]; do - docker ps -a - sleep 5 - done - fi + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done fi if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." @@ -127,7 +96,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP }}_conc${{ env.CONC }}_${{ runner.name }} run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ -f "$RESULT_FILENAME.json" ]; then @@ -139,10 +108,9 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} $TP $RESULT_FILENAME $FRAMEWORK $PRECISION - + python3 utils/process_result.py ${{ inputs.runner }} - name: Upload result uses: actions/upload-artifact@v4 with: name: ${{ env.RESULT_FILENAME }} - path: agg_${{ env.RESULT_FILENAME }}.json + path: agg_${{ env.RESULT_FILENAME }}.json \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index ffdae541c..d13584078 
100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -13,69 +13,50 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# Default +MOE_BACKEND="TRTLLM" -hf download $MODEL +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false +hf download $MODEL +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= if [[ "$TP" == "4" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi elif [[ "$TP" == "8" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 8 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 16 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git 
a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index e909b954a..6bc8c9fa7 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="DEEPGEMM" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 20101e466..5dfdf8617 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="CUTLASS" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then 
- DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index f85f5c13f..4f17d4d4f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -13,36 +13,34 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# Default MOE_BACKEND="TRTLLM" -DP_ATTENTION=false # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true + MOE_BACKEND="CUTLASS" fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to $MOE_BACKEND" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 -export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -50,7 +48,7 @@ cuda_graph_config: 
max_batch_size: $CONC enable_attention_dp: $DP_ATTENTION kv_cache_config: - dtype: fp8 + dtype: auto enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true @@ -90,6 +88,12 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" + if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then + sleep 5 + tail -n100 $SERVER_LOG + echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" + exit 1 + fi if [[ "$line" == *"Application startup complete"* ]]; then break fi @@ -106,4 +110,4 @@ python3 bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json +--result-filename $RESULT_FILENAME.json \ No newline at end of file From a1b74760a88bf4576504f2d6d65bd1450392df31 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 14:57:41 -0500 Subject: [PATCH 002/149] adding config and python script: --- .github/configs/master.json | 1025 +++++++++++++++++++++++++++++++++++ utils/get_configs.py | 29 + 2 files changed, 1054 insertions(+) create mode 100644 .github/configs/master.json create mode 100644 utils/get_configs.py diff --git a/.github/configs/master.json b/.github/configs/master.json new file mode 100644 index 000000000..1706be9ab --- /dev/null +++ b/.github/configs/master.json @@ -0,0 +1,1025 @@ +{ + "70b-fp8-h100-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + 
}, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-h200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-b200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 16, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + 
{"tp": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "70b-fp8-h200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "70b-fp8-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 16, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 128}, + {"tp": 2, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "70b-fp8-mi300x-vllm": { + "image": 
"rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 64, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-mi325x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 32, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 64, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 32}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp8-mi355x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + 
"model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "70b-fp4-b200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 32, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 16, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "conc-start": 4, "conc-end": 8} + ] + } + ] + }, + "70b-fp4-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 
1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 128, "conc-end": 128}, + {"tp": 2, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "conc-start": 16, "conc-end": 128}, + {"tp": 8, "conc-start": 4, "conc-end": 32} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 128}, + {"tp": 2, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "70b-fp4-mi355x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "amd/Llama-3.3-70B-Instruct-MXFP4-Preview", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "dsr1-fp8-h200-sgl": { + "image": "lmsysorg/sglang:v0.5.2rc2-cu126", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, 
"conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-b200-sgl": { + "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-h200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 
8192, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-mi300x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-mi325x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp8-mi355x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "precision": "fp8", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "dsr1-fp4-b200-sgl": { + "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", + "model": "nvidia/DeepSeek-R1-0528-FP4-V2", 
+ "precision": "fp4", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "dsr1-fp4-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "nvidia/DeepSeek-R1-0528-FP4-V2", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 16, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 32, "conc-end": 128}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "ep": 4, "dp-attn": false, 
"conc-start": 4, "conc-end": 32}, + {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 64, "conc-end": 256}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 256} + ] + } + ] + }, + "dsr1-fp4-mi355x-sgl": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "precision": "fp4", + "framework": "sglang", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "gptoss-fp4-h100-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "gptoss-fp4-h200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + 
{"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 16}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} + ] + } + ] + }, + "gptoss-fp4-b200-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "gptoss-fp4-h200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + 
{"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} + ] + } + ] + }, + "gptoss-fp4-b200-trt": { + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 8, "ep": 8, 
"dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} + ] + } + ] + }, + "gptoss-fp4-mi300x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "gptoss-fp4-mi325x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 1024, + "osl": 8192, + 
"bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 64, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 2, "conc-start": 4, "conc-end": 8}, + {"tp": 4, "conc-start": 4, "conc-end": 8}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "gptoss-fp4-mi355x-vllm": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", + "model": "openai/gpt-oss-120b", + "precision": "fp4", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 1024, + "osl": 8192, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + }, + { + "isl": 8192, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 64}, + {"tp": 4, "conc-start": 4, "conc-end": 16}, + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + } +} \ No newline at end of file diff --git a/utils/get_configs.py b/utils/get_configs.py new file mode 100644 index 000000000..cf160895b --- /dev/null +++ b/utils/get_configs.py @@ -0,0 +1,29 @@ +import json +import sys + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + if len(sys.argv) < 3: + print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}}") + exit(1) + + config_file = sys.argv[1] + seq_len = sys.argv[2] + + isl, osl = seq_len_stoi.get(seq_len) or (None, None) + if not (isl or osl): + raise ValueError(f"Input 'isl-osl' must be one of '{', '.join(seq_len_stoi.keys())}'.") + + try: + with open(config_file, 'r') as f: + config_data = json.load(f) + 
except Exception as e: + raise ValueError(f"Input file '{config_file}' does not exist.") + +if __name__ == "__main__": + main() \ No newline at end of file From d4747184bf40ab5446b22f7c8dec16d5a4596686 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 15:48:25 -0500 Subject: [PATCH 003/149] adding runner field --- .github/configs/master.json | 29 ++++++++++++++++ utils/get_configs.py | 66 +++++++++++++++++++++++++++++++++++-- 2 files changed, 93 insertions(+), 2 deletions(-) diff --git a/.github/configs/master.json b/.github/configs/master.json index 1706be9ab..274c22512 100644 --- a/.github/configs/master.json +++ b/.github/configs/master.json @@ -2,6 +2,7 @@ "70b-fp8-h100-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "h100", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -37,6 +38,7 @@ "70b-fp8-h200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "h200", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -75,6 +77,7 @@ "70b-fp8-b200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "b200", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -113,6 +116,7 @@ "70b-fp8-h200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "h200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -150,6 +154,7 @@ "70b-fp8-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/Llama-3.3-70B-Instruct-FP8", + "runner": "b200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -188,6 +193,7 @@ "70b-fp8-mi300x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "runner": "mi300x", "precision": "fp8", "framework": "vllm", 
"seq-len-configs": [ @@ -226,6 +232,7 @@ "70b-fp8-mi325x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "runner": "mi325x", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -264,6 +271,7 @@ "70b-fp8-mi355x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", + "runner": "mi355x", "precision": "fp8", "framework": "vllm", "seq-len-configs": [ @@ -302,6 +310,7 @@ "70b-fp4-b200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "runner": "b200", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -340,6 +349,7 @@ "70b-fp4-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/Llama-3.3-70B-Instruct-FP4", + "runner": "b200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -378,6 +388,7 @@ "70b-fp4-mi355x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "amd/Llama-3.3-70B-Instruct-MXFP4-Preview", + "runner": "mi355x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -416,6 +427,7 @@ "dsr1-fp8-h200-sgl": { "image": "lmsysorg/sglang:v0.5.2rc2-cu126", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "h200", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -445,6 +457,7 @@ "dsr1-fp8-b200-sgl": { "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "b200", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -474,6 +487,7 @@ "dsr1-fp8-h200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "h200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -504,6 +518,7 @@ "dsr1-fp8-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", 
"model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "b200-trt", "precision": "fp8", "framework": "trt", "seq-len-configs": [ @@ -534,6 +549,7 @@ "dsr1-fp8-mi300x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "mi300x", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -563,6 +579,7 @@ "dsr1-fp8-mi325x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "mi325x", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -592,6 +609,7 @@ "dsr1-fp8-mi355x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", "model": "deepseek-ai/DeepSeek-R1-0528", + "runner": "mi355x", "precision": "fp8", "framework": "sglang", "seq-len-configs": [ @@ -621,6 +639,7 @@ "dsr1-fp4-b200-sgl": { "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", "model": "nvidia/DeepSeek-R1-0528-FP4-V2", + "runner": "b200", "precision": "fp4", "framework": "sglang", "seq-len-configs": [ @@ -653,6 +672,7 @@ "dsr1-fp4-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "nvidia/DeepSeek-R1-0528-FP4-V2", + "runner": "b200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -695,6 +715,7 @@ "dsr1-fp4-mi355x-sgl": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "runner": "mi355x", "precision": "fp4", "framework": "sglang", "seq-len-configs": [ @@ -725,6 +746,7 @@ "gptoss-fp4-h100-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "openai/gpt-oss-120b", + "runner": "h100", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -760,6 +782,7 @@ "gptoss-fp4-h200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "openai/gpt-oss-120b", + "runner": "h200", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -798,6 
+821,7 @@ "gptoss-fp4-b200-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "openai/gpt-oss-120b", + "runner": "b200", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -836,6 +860,7 @@ "gptoss-fp4-h200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "openai/gpt-oss-120b", + "runner": "h200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -874,6 +899,7 @@ "gptoss-fp4-b200-trt": { "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", "model": "openai/gpt-oss-120b", + "runner": "b200-trt", "precision": "fp4", "framework": "trt", "seq-len-configs": [ @@ -914,6 +940,7 @@ "gptoss-fp4-mi300x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "openai/gpt-oss-120b", + "runner": "mi300x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -952,6 +979,7 @@ "gptoss-fp4-mi325x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "openai/gpt-oss-120b", + "runner": "mi325x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ @@ -990,6 +1018,7 @@ "gptoss-fp4-mi355x-vllm": { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", "model": "openai/gpt-oss-120b", + "runner": "mi355x", "precision": "fp4", "framework": "vllm", "seq-len-configs": [ diff --git a/utils/get_configs.py b/utils/get_configs.py index cf160895b..787db0b72 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -9,11 +9,12 @@ def main(): if len(sys.argv) < 3: - print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}}") + print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} [step-size]") exit(1) config_file = sys.argv[1] seq_len = sys.argv[2] + step_size = int(sys.argv[3]) if len(sys.argv) > 3 else 2 isl, osl = seq_len_stoi.get(seq_len) or (None, None) if not (isl or osl): @@ -22,8 +23,69 @@ def main(): try: with open(config_file, 'r') as f: config_data = json.load(f) - except 
Exception as e: + assert isinstance(config_data, dict) + except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") + + matrix_values = [] + for key, val in config_data.items(): + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + bmk_space = val.get('bmk-space') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields for key '{key}'" + assert bmk_space, f"Missing 'bmk-space' for key '{key}'" + + # Check if this config has matching sequence lengths + matching_seq_config = None + for slq in seq_len_configs: + if slq.get('isl') == isl and slq.get('osl') == osl: + matching_seq_config = slq + break + + if not matching_seq_config: + continue # Skip this config if no matching sequence length + + # Now flatten the bmk-space + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc + } + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= step_size + if conc > conc_end: + conc = conc_end # Ensure we hit the end value + + print(json.dumps(matrix_values)) + return matrix_values if __name__ == "__main__": main() \ No newline at end of file From 346b10d97de81703f77adf18c0f35e095ca1ef76 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:15:18 -0500 Subject: [PATCH 004/149] finishing up 
script, ready for testing --- .github/workflows/1k1k-sweep.yml | 7 ++++++- utils/get_configs.py | 19 ++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index e1a103f83..3d01ceabf 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -9,8 +9,13 @@ jobs: outputs: search-space-config: ${{ steps.get-1k1k-configs.outputs.search-space-config }} steps: + - name: Checkout code + uses: actions/checkout@v4 + - id: get-1k1k-configs - run: python utils/print_configs_json.py configs.json 1k1k + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/print_configs_json.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark: needs: get-1k1k-configs diff --git a/utils/get_configs.py b/utils/get_configs.py index 787db0b72..9fa911d3e 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -37,11 +37,9 @@ def main(): precision = val.get('precision') framework = val.get('framework') runner = val.get('runner') - bmk_space = val.get('bmk-space') assert None not in (image, model, precision, framework, runner), \ f"Missing required fields for key '{key}'" - assert bmk_space, f"Missing 'bmk-space' for key '{key}'" # Check if this config has matching sequence lengths matching_seq_config = None @@ -51,13 +49,17 @@ def main(): break if not matching_seq_config: - continue # Skip this config if no matching sequence length + continue # Skip this config if no matching sequence length, this is possible + + bmk_space = matching_seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" - # Now flatten the bmk-space for bmk in bmk_space: tp = bmk.get('tp') conc_start = bmk.get('conc-start') conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') assert None not in (tp, conc_start, conc_end), \ f"Missing 
'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" @@ -76,13 +78,20 @@ def main(): 'tp': tp, 'conc': conc } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + matrix_values.append(entry) if conc == conc_end: break conc *= step_size if conc > conc_end: - conc = conc_end # Ensure we hit the end value + conc = conc_end print(json.dumps(matrix_values)) return matrix_values From 0dc246c85eafc50da9b2c5b434e5e7d5c495c43d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:22:09 -0500 Subject: [PATCH 005/149] testing purposes --- .github/workflows/1k1k-sweep.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3d01ceabf..c0bbbf57e 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,6 +1,7 @@ name: '1K/1K Sweep' on: + pull_request: workflow_dispatch: jobs: From 02f57924c32cd142ba8a7064e35358e015bebf39 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:22:50 -0500 Subject: [PATCH 006/149] testing purposes --- .github/workflows/1k1k-sweep.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index c0bbbf57e..73bb9d5a6 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -15,7 +15,7 @@ jobs: - id: get-1k1k-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/print_configs_json.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark: From e93d20bb7f6716daa0a161fa94cafba37555d5de Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:43:09 -0500 Subject: [PATCH 007/149] refactoring more --- 
.github/workflows/1k1k-sweep.yml | 90 ++++++++++++++++++++++++++++---- utils/get_configs.py | 13 +++-- 2 files changed, 88 insertions(+), 15 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 73bb9d5a6..3e199572d 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -5,26 +5,92 @@ on: workflow_dispatch: jobs: - get-1k1k-configs: + get-70b-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-1k1k-configs.outputs.search-space-config }} + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} steps: - name: Checkout code uses: actions/checkout@v4 - - - id: get-1k1k-configs + + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - benchmark: - needs: get-1k1k-configs + + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k gptoss) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + benchmark-70b: + needs: get-70b-configs + uses: 
./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + tp: ${{ matrix.config.tp }} + conc: ${{ matrix.config.conc }} + + benchmark-dsr1: + needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + tp: ${{ matrix.config.tp }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss: + needs: get-gptoss-configs uses: ./.github/workflows/benchmark-tmpl.yml strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-1k1k-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} secrets: inherit with: isl: 1024 @@ -39,5 +105,7 @@ jobs: conc: ${{ matrix.config.conc }} # collect-results: -# needs: benchmark -# steps: \ No newline at end of file +# needs: [benchmark-70b, benchmark-dsr1, benchmark-gptoss] +# uses: ./.github/workflows/collect-results.yml +# with: +# exp-name: 1k1k \ No newline at end of file diff --git a/utils/get_configs.py b/utils/get_configs.py index 9fa911d3e..7aec991b0 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -8,13 +8,14 @@ } def main(): - if len(sys.argv) < 3: - print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} [step-size]") + if len(sys.argv) 
< 4: + print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} {{model-prefix}} [step-size]") exit(1) - + config_file = sys.argv[1] seq_len = sys.argv[2] - step_size = int(sys.argv[3]) if len(sys.argv) > 3 else 2 + model_prefix = sys.argv[3] + step_size = int(sys.argv[4]) if len(sys.argv) > 4 else 2 isl, osl = seq_len_stoi.get(seq_len) or (None, None) if not (isl or osl): @@ -29,6 +30,10 @@ def main(): matrix_values = [] for key, val in config_data.items(): + # Filter by model prefix + if not key.startswith(model_prefix): + continue + seq_len_configs = val.get('seq-len-configs') assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" From 88239ac6400dff146621c62fb8d164f4fba73b34 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:48:51 -0500 Subject: [PATCH 008/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index c78dcb602..4a21825d5 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -65,8 +65,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.runner }} ${{ inputs.exp-name }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' - + name: '${{ inputs.exp-name }} ${{ inputs.runner }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | From f00e47da557329fbbf0ae373de290a9e2e7b8628 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:53:21 -0500 Subject: [PATCH 009/149] refactoring more --- .github/workflows/1k1k-sweep.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3e199572d..512ae819c 100644 --- 
a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -53,6 +53,7 @@ jobs: config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} secrets: inherit with: + exp-name: "70b_1k1k" isl: 1024 osl: 1024 runner: ${{ matrix.config.runner }} @@ -60,7 +61,6 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} @@ -73,6 +73,7 @@ jobs: config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} secrets: inherit with: + exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 runner: ${{ matrix.config.runner }} @@ -80,7 +81,6 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} @@ -93,6 +93,7 @@ jobs: config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} secrets: inherit with: + exp-name: "gptoss_1k1k" isl: 1024 osl: 1024 runner: ${{ matrix.config.runner }} @@ -100,7 +101,6 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} From 8cc9eebd6ab0879092c759d120747886b1e5771a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 24 Oct 2025 16:54:48 -0500 Subject: [PATCH 010/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4a21825d5..5f52c94ef 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -65,7 +65,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ 
inputs.exp-name }} ${{ inputs.runner }}-${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa-${{ inputs.dp-attn }} conc${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | From 7be26739febd03147baf233dd82ab1b280679751 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:17:29 -0500 Subject: [PATCH 011/149] refactoring more --- .github/workflows/1k1k-sweep.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 512ae819c..85a175a4a 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -47,6 +47,7 @@ jobs: benchmark-70b: needs: get-70b-configs uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k1k strategy: fail-fast: false matrix: @@ -67,6 +68,7 @@ jobs: benchmark-dsr1: needs: get-dsr1-configs uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k1k strategy: fail-fast: false matrix: @@ -87,6 +89,7 @@ jobs: benchmark-gptoss: needs: get-gptoss-configs uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k strategy: fail-fast: false matrix: From f9c5e2757e1d617a2f79d80db9a124f91e500b3b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:24:29 -0500 Subject: [PATCH 012/149] refactoring more --- .github/workflows/1k1k-sweep.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 85a175a4a..2e6c3cffa 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -57,6 +57,7 @@ jobs: exp-name: "70b_1k1k" isl: 1024 osl: 1024 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -78,6 +79,7 @@ jobs: exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 + max-model-len: 2048 runner: ${{ 
matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -99,6 +101,7 @@ jobs: exp-name: "gptoss_1k1k" isl: 1024 osl: 1024 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} From bb460c7d3516b772e072f3830c0a7a91f385ad18 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:25:14 -0500 Subject: [PATCH 013/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 5f52c94ef..d785e32e5 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -41,6 +41,9 @@ on: required: false type: boolean default: false + max-model-len: + required: true + type: string conc: required: true type: string @@ -52,6 +55,7 @@ env: MODEL: ${{ inputs.model }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} + MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} IMAGE: ${{ inputs.image }} FRAMEWORK: ${{ inputs.framework }} From 2a5658adbbd971ec7400676301890e3a03fc352b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:29:49 -0500 Subject: [PATCH 014/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d785e32e5..90df56641 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -36,7 +36,7 @@ on: ep: required: false type: string - default: '' + default: '1' dp-attn: required: false type: boolean From 15da179aef6662a3101e462337d19ffd8e104553 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 15:39:29 -0500 Subject: [PATCH 015/149] refactoring more --- .github/workflows/benchmark-tmpl.yml | 2 +- 
utils/process_result.py | 23 +++++++++++------------ 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 90df56641..66373a2f5 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -111,7 +111,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} + python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION - name: Upload result uses: actions/upload-artifact@v4 with: diff --git a/utils/process_result.py b/utils/process_result.py index aaf8ac0d2..a59d1f7f3 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -5,31 +5,30 @@ hw = sys.argv[1] tp_size = int(sys.argv[2]) -result_filename = sys.argv[3] -framework = sys.argv[4] -precision = sys.argv[5] +ep_size = int(sys.argv[3]) +dp_attention = sys.argv[4] +result_filename = sys.argv[5] +framework = sys.argv[6] +precision = sys.argv[7] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) -tput_per_gpu = float(bmk_result['total_token_throughput']) / tp_size -output_tput_per_gpu = float(bmk_result['output_throughput']) / tp_size -input_tput_per_gpu = tput_per_gpu - output_tput_per_gpu - data = { 'hw': hw, 'tp': tp_size, + 'ep': ep_size, 'conc': int(bmk_result['max_concurrency']), + 'dp_attention': dp_attention, # true or false 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, - 'tput_per_gpu': tput_per_gpu, - 'output_tput_per_gpu': output_tput_per_gpu, - 'input_tput_per_gpu': input_tput_per_gpu + 'tput_per_gpu': float(bmk_result['total_token_throughput']) / tp_size, + 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size } -if len(sys.argv) == 7: # MTP - data['mtp'] = sys.argv[6] +if len(sys.argv) == 9: # MTP + data['mtp'] = sys.argv[8] for key, value in bmk_result.items(): if key.endswith('ms'): From 
9bf6b1fdf5facb079b8a5d6d76eb473bfdeed8a9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 16:07:09 -0500 Subject: [PATCH 016/149] refactoring more --- .github/configs/master.json | 6 ++---- .github/workflows/benchmark-tmpl.yml | 2 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 25 ++++++++++--------------- 3 files changed, 13 insertions(+), 20 deletions(-) diff --git a/.github/configs/master.json b/.github/configs/master.json index 274c22512..00a2c0a17 100644 --- a/.github/configs/master.json +++ b/.github/configs/master.json @@ -909,8 +909,7 @@ "bmk-space": [ {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} ] }, @@ -920,8 +919,7 @@ "bmk-space": [ {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 8}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 16, "conc-end": 64}, + {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} ] }, diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 66373a2f5..6cfb692fe 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -99,7 +99,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION 
}}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_conc${{ env.CONC }}_${{ runner.name }} run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ -f "$RESULT_FILENAME.json" ]; then diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 4f17d4d4f..6b2f251dd 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -21,26 +21,27 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION" +MOE_BACKEND="TRTLLM" + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, DP_ATTENTION: $DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= -# Default -MOE_BACKEND="TRTLLM" +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS +# Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then - MOE_BACKEND="CUTLASS" + EP_SIZE="$TP" + DP_ATTENTION=true fi -echo "MOE_BACKEND set to $MOE_BACKEND" - EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 +export NCCL_GRAPH_REGISTER=0 cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: @@ -48,7 +49,7 @@ cuda_graph_config: max_batch_size: $CONC enable_attention_dp: $DP_ATTENTION kv_cache_config: - dtype: auto + dtype: fp8 enable_block_reuse: false free_gpu_memory_fraction: 0.85 print_iter_log: true @@ -88,12 +89,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi @@ -110,4 +105,4 @@ python3 
bench_serving/benchmark_serving.py \ --request-rate inf --ignore-eos \ --save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ --result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file +--result-filename $RESULT_FILENAME.json From 8d330cd0a7b00a6ca1457e2d3a10fe44623d8209 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sat, 25 Oct 2025 16:09:17 -0500 Subject: [PATCH 017/149] refactoring more --- .github/configs/master.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/master.json b/.github/configs/master.json index 00a2c0a17..d42b98b46 100644 --- a/.github/configs/master.json +++ b/.github/configs/master.json @@ -897,7 +897,7 @@ ] }, "gptoss-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", + "image": "nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1", "model": "openai/gpt-oss-120b", "runner": "b200-trt", "precision": "fp4", From 8f665ddade9354db612b008f75e59be7b5ec8e6c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:13:39 -0500 Subject: [PATCH 018/149] updating the benchmark files with logic --- .github/configs/master.json | 1052 ----------------------- .github/configs/master.yaml | 784 +++++++++++++++++ benchmarks/dsr1_fp4_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 11 +- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 11 +- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 15 +- utils/get_configs.py | 3 +- 7 files changed, 809 insertions(+), 1078 deletions(-) delete mode 100644 .github/configs/master.json create mode 100644 .github/configs/master.yaml diff --git a/.github/configs/master.json b/.github/configs/master.json deleted file mode 100644 index d42b98b46..000000000 --- a/.github/configs/master.json +++ /dev/null @@ -1,1052 +0,0 @@ -{ - "70b-fp8-h100-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "h100", - "precision": "fp8", - "framework": "vllm", - 
"seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-h200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "h200", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-b200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "b200", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 
64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 16, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "70b-fp8-h200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "h200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "70b-fp8-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP8", - "runner": "b200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 
128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 16, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 128}, - {"tp": 2, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "70b-fp8-mi300x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi300x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 64, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-mi325x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi325x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 32, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 
64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 64, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp8-mi355x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi355x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "70b-fp4-b200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP4", - "runner": "b200", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 32, "conc-end": 64}, - {"tp": 4, 
"conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 16, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "conc-start": 4, "conc-end": 8} - ] - } - ] - }, - "70b-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP4", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "conc-start": 16, "conc-end": 128}, - {"tp": 8, "conc-start": 4, "conc-end": 32} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 128}, - {"tp": 2, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "70b-fp4-mi355x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-MXFP4-Preview", - "runner": "mi355x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 
64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "dsr1-fp8-h200-sgl": { - "image": "lmsysorg/sglang:v0.5.2rc2-cu126", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "h200", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-b200-sgl": { - "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "b200", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-h200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "h200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, 
"conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "b200-trt", - "precision": "fp8", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-mi300x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "mi300x", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-mi325x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "mi325x", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - 
"bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp8-mi355x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", - "model": "deepseek-ai/DeepSeek-R1-0528", - "runner": "mi355x", - "precision": "fp8", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "dsr1-fp4-b200-sgl": { - "image": "lmsysorg/sglang:v0.5.3rc1-cu129-b200", - "model": "nvidia/DeepSeek-R1-0528-FP4-V2", - "runner": "b200", - "precision": "fp4", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 128} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 128}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "dsr1-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/DeepSeek-R1-0528-FP4-V2", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, - {"tp": 8, 
"ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 16, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 32, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 256, "conc-end": 256} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "dp-attn": true, "conc-start": 64, "conc-end": 256}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": true, "conc-start": 64, "conc-end": 256} - ] - } - ] - }, - "dsr1-fp4-mi355x-sgl": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", - "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", - "runner": "mi355x", - "precision": "fp4", - "framework": "sglang", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "gptoss-fp4-h100-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "openai/gpt-oss-120b", - "runner": "h100", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - 
{"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "gptoss-fp4-h200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "openai/gpt-oss-120b", - "runner": "h200", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 16}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 32} - ] - } - ] - }, - "gptoss-fp4-b200-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "openai/gpt-oss-120b", - "runner": "b200", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, 
"conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "gptoss-fp4-h200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "openai/gpt-oss-120b", - "runner": "h200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 32}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 64} - ] - } - ] - }, - "gptoss-fp4-b200-trt": { - "image": 
"nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1", - "model": "openai/gpt-oss-120b", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "dp-attn": false, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "dp-attn": false, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "dp-attn": false, "conc-start": 4, "conc-end": 8} - ] - } - ] - }, - "gptoss-fp4-mi300x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "openai/gpt-oss-120b", - "runner": "mi300x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, 
"conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-fp4-mi325x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "openai/gpt-oss-120b", - "runner": "mi325x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "ep": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "ep": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "ep": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 64, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64} - ] - }, - { - "isl": 8192, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 2, "conc-start": 4, "conc-end": 8}, - {"tp": 4, "conc-start": 4, "conc-end": 8}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-fp4-mi355x-vllm": { - "image": "rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "openai/gpt-oss-120b", - "runner": "mi355x", - "precision": "fp4", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - }, - { - "isl": 8192, - "osl": 1024, - 
"bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 64}, - {"tp": 4, "conc-start": 4, "conc-end": 16}, - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - } -} \ No newline at end of file diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml new file mode 100644 index 000000000..e83df34c0 --- /dev/null +++ b/.github/configs/master.yaml @@ -0,0 +1,784 @@ +70b-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 32 } + - { 
tp: 8, conc-start: 4, conc-end: 8 } + +70b-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview + runner: mi355x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 
2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 16, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + +70b-fp8-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h100 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, 
conc-start: 4, conc-end: 32 } + +70b-fp8-h200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi300x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi325x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, 
conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi355x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp4-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200 + precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + 
+dsr1-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 8, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 1024 + osl: 8192 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 16, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 8192 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 and DP_ATTN=true + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } + # If TP=8, + # If CONC > 32, then EP=8 and DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + +dsr1-fp4-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: amd/DeepSeek-R1-0528-MXFP4-Preview + runner: mi355x + precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, 
conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + # For all sequence lengths, EP=TP + - isl: 1024 + osl: 1024 + bmk-space: + # If CONC > 32, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-sgl: + image: lmsysorg/sglang:v0.5.2rc2-cu126 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200-trt + precision: fp8 + framework: trt + # For all sequence lengths, EP=TP + seq-len-configs: + - isl: 1024 + osl: 1024 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 
4, conc-end: 64 } + - isl: 1024 + osl: 8192 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + # If CONC > 32, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + +dsr1-fp8-mi300x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi300x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi325x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi325x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi355x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 + model: openai/gpt-oss-120b + runner: b200-trt + precision: fp4 + framework: trt + # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true + seq-len-configs: + - isl: 1024 + osl: 1024 + 
bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + +gptoss-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: h100 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + 
- { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + +gptoss-fp4-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: openai/gpt-oss-120b + runner: h200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: h200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, 
conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + +gptoss-fp4-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi300x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi325x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi355x + 
precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index d13584078..ababfa150 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -13,19 +13,18 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -# Default -MOE_BACKEND="TRTLLM" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL -# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="TRTLLM" + if [[ "$TP" == "4" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then if [[ $CONC -ge 256 ]]; then diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 6bc8c9fa7..509cca7ba 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -13,17 +13,20 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -MOE_BACKEND="DEEPGEMM" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, 
MOE_BACKEND=$MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="DEEPGEMM" + +echo "MOE_BACKEND set to '$MOE_BACKEND'" + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) EXTRA_CONFIG_FILE="dsr1-fp8.yml" diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 5dfdf8617..174d67b53 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -13,17 +13,20 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -MOE_BACKEND="CUTLASS" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL +# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="CUTLASS" + +echo "MOE_BACKEND set to '$MOE_BACKEND'" + SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) EXTRA_CONFIG_FILE="dsr1-fp8.yml" diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 6b2f251dd..349930dfb 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -13,31 +13,24 @@ # CONC # RESULT_FILENAME # PORT_OFFSET -# EP_SIZE # DP_ATTENTION +# EP_SIZE # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -MOE_BACKEND="TRTLLM" - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, DP_ATTENTION: $DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" +echo "TP: $TP, 
CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +MOE_BACKEND="TRTLLM" -# Higher concurrencies: Concurrency >= 256 -# MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE -if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true -fi +echo "MOE_BACKEND set to '$MOE_BACKEND'" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 diff --git a/utils/get_configs.py b/utils/get_configs.py index 7aec991b0..24c6ea8a3 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -1,4 +1,5 @@ import json +import yaml import sys seq_len_stoi = { @@ -23,7 +24,7 @@ def main(): try: with open(config_file, 'r') as f: - config_data = json.load(f) + config_data = yaml.safe_load(f) assert isinstance(config_data, dict) except FileNotFoundError: raise ValueError(f"Input file '{config_file}' does not exist.") From 098748283eca956be8d7255a42f006fbff31475a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:14:38 -0500 Subject: [PATCH 019/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 2e6c3cffa..9b1c10aae 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -15,7 +15,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -28,7 +28,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 
${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -41,7 +41,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.json 1k1k gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: From d9fd1910668bec01a9632588e4c74caa6df77fb7 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:21:36 -0500 Subject: [PATCH 020/149] updating the benchmark files with logic --- .github/configs/master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml index e83df34c0..e085deb3f 100644 --- a/.github/configs/master.yaml +++ b/.github/configs/master.yaml @@ -559,7 +559,7 @@ dsr1-fp8-mi355x-sgl: gptoss-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 model: openai/gpt-oss-120b - runner: b200-trt + runner: b200-nvs precision: fp4 framework: trt # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true From 78f6b8d2d0e20610fba59b43a5129606cbd1111e Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 12:37:08 -0500 Subject: [PATCH 021/149] updating the benchmark files with logic --- .github/configs/master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml index e085deb3f..b5743fe51 100644 --- a/.github/configs/master.yaml +++ b/.github/configs/master.yaml @@ -642,7 +642,7 @@ gptoss-fp4-h100-vllm: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } gptoss-fp4-h200-trt: - image: 
nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev model: openai/gpt-oss-120b runner: h200-trt precision: fp4 From d808413f42ebd9c384901f6fea46e0cfa59d797b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 13:36:08 -0500 Subject: [PATCH 022/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 28 +++++++++++++++++++++++----- utils/summarize.py | 8 +++++--- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 9b1c10aae..129bad2af 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -110,8 +110,26 @@ jobs: tp: ${{ matrix.config.tp }} conc: ${{ matrix.config.conc }} -# collect-results: -# needs: [benchmark-70b, benchmark-dsr1, benchmark-gptoss] -# uses: ./.github/workflows/collect-results.yml -# with: -# exp-name: 1k1k \ No newline at end of file + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_1k1k' + + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' \ No newline at end of file diff --git a/utils/summarize.py b/utils/summarize.py index 1f78caf9c..de8863c78 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -9,11 +9,11 @@ with open(result_path) as f: result = json.load(f) results.append(result) -results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['conc'])) +results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) 
summary_header = f'''\ -| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) @@ -25,7 +25,9 @@ f"| {framework.upper()} " f"| {precision.upper()} " f"| {result['tp']} " + f"| {result['ep']} " f"| {result['conc']} " + f"| {result['dp_attention']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " f"| {result['median_e2el']:.4f} " From bc24be4bc587f72bca15d50c1446e50016cbc433 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:32:00 -0500 Subject: [PATCH 023/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 129bad2af..3341b1b50 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -64,6 +64,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -86,6 +88,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -108,6 +112,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: From 7479f743f52a962706b10f9d51d8329afc5e0904 Mon Sep 17 00:00:00 2001 
From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 024/149] testing concurrency From 93fba3b8dbc302e8443aac0f6b0c7a25296773e9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:54:00 -0500 Subject: [PATCH 025/149] updating the benchmark files with logic --- .github/workflows/1k1k-sweep.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3341b1b50..313470c31 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -64,8 +64,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -88,8 +88,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -112,8 +112,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} collect-70b-results: From d021eb3627b71f771d51efde5f723545898db81e Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 09:06:50 -0500 Subject: [PATCH 026/149] updating the benchmark files with logic --- .github/configs/amd-master.yaml | 280 +++++++++++++++ .github/configs/nvidia-master.yaml | 503 +++++++++++++++++++++++++++ .github/workflows/1k1k-sweep.yml | 252 +++++++------- 
.github/workflows/benchmark-tmpl.yml | 36 +- utils/get_configs.py | 73 ++-- 5 files changed, 990 insertions(+), 154 deletions(-) create mode 100644 .github/configs/amd-master.yaml create mode 100644 .github/configs/nvidia-master.yaml diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml new file mode 100644 index 000000000..2465ee5b6 --- /dev/null +++ b/.github/configs/amd-master.yaml @@ -0,0 +1,280 @@ +70b-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview + runner: mi355x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi300x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 
64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi325x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 32, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: amd/Llama-3.3-70B-Instruct-FP8-KV + runner: mi355x + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp4-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: amd/DeepSeek-R1-0528-MXFP4-Preview + runner: mi355x + 
precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi300x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi300x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi325x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi325x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-mi355x-sgl: + image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 + model: deepseek-ai/DeepSeek-R1-0528 + runner: mi355x + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-mi300x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi300x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, 
conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi325x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi325x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 64, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 8 } + - { tp: 4, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +gptoss-fp4-mi355x-vllm: + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + model: openai/gpt-oss-120b + runner: mi355x + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + 
- { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml new file mode 100644 index 000000000..9ac3fbcf3 --- /dev/null +++ b/.github/configs/nvidia-master.yaml @@ -0,0 +1,503 @@ +70b-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP4 + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, 
conc-start: 4, conc-end: 32 } + - { tp: 8, conc-start: 4, conc-end: 8 } + +70b-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 16, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 128 } + - { tp: 2, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } + +70b-fp8-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: b200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 16, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + +70b-fp8-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h100 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, 
conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + +70b-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200-trt + precision: fp8 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 128, conc-end: 128 } + - { tp: 2, conc-start: 64, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 32 } + +70b-fp8-h200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: nvidia/Llama-3.3-70B-Instruct-FP8 + runner: h200 + precision: fp8 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 32, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 64, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, conc-start: 16, conc-end: 64 } + - { tp: 2, conc-start: 16, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 
4, conc-end: 64 } + +dsr1-fp4-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200 + precision: fp4 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + +dsr1-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + runner: b200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 8, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 1024 + osl: 8192 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 16, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 8192 + osl: 1024 + bmk-space: + # If TP=4, + # If CONC > 32, then EP=4 and DP_ATTN=true + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, 
ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } + # If TP=8, + # If CONC > 32, then EP=8 and DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } + +dsr1-fp8-b200-sgl: + image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: b200-trt + precision: fp8 + framework: trt + seq-len-configs: + # For all sequence lengths, EP=TP + - isl: 1024 + osl: 1024 + bmk-space: + # If CONC > 32, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-sgl: + image: lmsysorg/sglang:v0.5.2rc2-cu126 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200 + precision: fp8 + framework: sglang + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +dsr1-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + runner: h200-trt + precision: fp8 + framework: trt + # For all sequence lengths, EP=TP + 
seq-len-configs: + - isl: 1024 + osl: 1024 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + # If CONC > 64, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + # If CONC > 32, then DP_ATTN=true + bmk-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + +gptoss-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 + model: openai/gpt-oss-120b + runner: b200-nvs + precision: fp4 + framework: trt + # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + +gptoss-fp4-b200-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: b200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, 
conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h100-vllm: + image: vllm/vllm-openai:v0.10.2 + model: openai/gpt-oss-120b + runner: h100 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + +gptoss-fp4-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev + model: openai/gpt-oss-120b + runner: h200-trt + precision: fp4 + framework: trt + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + +gptoss-fp4-h200-vllm: + image: vllm/vllm-openai:v0.10.2 
+ model: openai/gpt-oss-120b + runner: h200 + precision: fp4 + framework: vllm + seq-len-configs: + - isl: 1024 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + bmk-space: + - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 313470c31..04b0b9b86 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,141 +1,141 @@ -name: '1K/1K Sweep' +name: "1K/1K Sweep" on: - pull_request: - workflow_dispatch: + pull_request: + workflow_dispatch: jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 + get-70b-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 - - id: get-70b-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k 
--model-prefix 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 - - id: get-dsr1-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + - id: get-dsr1-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-gptoss-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 - - id: get-gptoss-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py ${GITHUB_WORKSPACE}/.github/configs/master.yaml 1k1k gptoss) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: 
get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} - benchmark-dsr1: - needs: get-dsr1-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + benchmark-dsr1: + 
needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} - benchmark-gptoss: - needs: get-gptoss-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "gptoss_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + benchmark-gptoss: + needs: get-gptoss-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ 
matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k1k' + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_1k1k" - collect-dsr1-results: - needs: benchmark-dsr1 - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" - collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' \ No newline at end of file + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 6cfb692fe..e4ec98314 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -74,13 +74,35 @@ jobs: - name: Resource cleanup run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then - echo "[Docker] Cleaning up resources ..." 
- docker ps -aq | xargs -r docker rm -f - docker network prune -f - while [ -n "$(docker ps -aq)" ]; do - docker ps -a - sleep 5 - done + host=$(hostname) + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then + echo "[INFO] Running container-by-container cleanup on $host" + for cid in $(docker ps -aq); do + echo "[INFO] Cleaning container $cid" + # Try graceful first + docker stop -t 90 "$cid" || true + # Wait until it's really dead + docker wait "$cid" >/dev/null 2>&1 || true + # Force remove if anything lingers + docker rm -f "$cid" >/dev/null 2>&1 || true + done + # Give a moment for GPU processes to fully terminate + sleep 2 + # Verify GPUs are now idle + if nvidia-smi --query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then + echo "[WARN] After stop, GPU still busy:" + nvidia-smi + # Last resort if driver allows and GPUs appear idle otherwise: + # nvidia-smi --gpu-reset -i 0,1,2,3,4,5,6,7 2>/dev/null || true + fi + else + echo "[Docker] Cleaning up resources ..." + docker ps -aq | xargs -r docker rm -f + docker network prune -f + while [ -n "$(docker ps -aq)" ]; do + docker ps -a + sleep 5 + done fi if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." 
diff --git a/utils/get_configs.py b/utils/get_configs.py index 24c6ea8a3..01e13f313 100644 --- a/utils/get_configs.py +++ b/utils/get_configs.py @@ -1,6 +1,7 @@ import json import yaml import sys +import argparse seq_len_stoi = { "1k1k": (1024, 1024), @@ -9,30 +10,60 @@ } def main(): - if len(sys.argv) < 4: - print(f"Usage: python3 {sys.argv[0]} {{config-file}} {{isl-osl}} {{model-prefix}} [step-size]") - exit(1) - - config_file = sys.argv[1] - seq_len = sys.argv[2] - model_prefix = sys.argv[3] - step_size = int(sys.argv[4]) if len(sys.argv) > 4 else 2 + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from configuration files' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--seq-lens', + choices=list(seq_len_stoi.keys()), + required=True, + help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" + ) + parser.add_argument( + '--model-prefix', + required=True, + help='Model prefix to filter configurations' + ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() - isl, osl = seq_len_stoi.get(seq_len) or (None, None) - if not (isl or osl): - raise ValueError(f"Input 'isl-osl' must be one of '{', '.join(seq_len_stoi.keys())}'.") + isl, osl = seq_len_stoi[args.seq_lens] - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys, shouldn't really be an issue but with NVIDIA and AMD + # separate configs 
this will help against any possible confusion + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") matrix_values = [] - for key, val in config_data.items(): - # Filter by model prefix - if not key.startswith(model_prefix): + for key, val in all_config_data.items(): + # Filter by model prefix i.e., + if not key.startswith(args.model_prefix): continue seq_len_configs = val.get('seq-len-configs') @@ -95,7 +126,7 @@ def main(): if conc == conc_end: break - conc *= step_size + conc *= args.step_size if conc > conc_end: conc = conc_end From 09ebb8a8887b18e2fef98edbbbbf09e3c4743763 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 09:08:08 -0500 Subject: [PATCH 027/149] updating the benchmark files with logic --- .github/workflows/benchmark-tmpl.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index e4ec98314..4f8468a82 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -75,19 +75,26 @@ jobs: run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then host=$(hostname) + if [[ "$host" == "b200-81" || "$host" == "b200-80" || "$host" == "b200-79" ]]; then echo "[INFO] Running container-by-container cleanup on $host" + for cid in $(docker ps -aq); do echo "[INFO] Cleaning container $cid" + # Try graceful first docker stop -t 90 "$cid" || true + # Wait until it's really dead docker wait "$cid" >/dev/null 2>&1 || true + # Force remove if anything lingers docker rm -f "$cid" >/dev/null 2>&1 || true done + # Give a moment for GPU processes to fully terminate sleep 2 + # Verify GPUs are now idle if nvidia-smi 
--query-compute-apps=pid --format=csv,noheader | grep -q '[0-9]'; then echo "[WARN] After stop, GPU still busy:" @@ -103,6 +110,7 @@ jobs: docker ps -a sleep 5 done + fi fi if command -v squeue >/dev/null 2>&1; then echo "[Slurm] Cleaning up resources ..." From 6c61ba9ff059f4ad56ce67e3bdaad9ebfe3aafa8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 09:11:04 -0500 Subject: [PATCH 028/149] updating the benchmark files with logic --- .github/configs/master.yaml | 784 ------------------------------- .github/workflows/1k1k-sweep.yml | 120 ++--- 2 files changed, 60 insertions(+), 844 deletions(-) delete mode 100644 .github/configs/master.yaml diff --git a/.github/configs/master.yaml b/.github/configs/master.yaml deleted file mode 100644 index b5743fe51..000000000 --- a/.github/configs/master.yaml +++ /dev/null @@ -1,784 +0,0 @@ -70b-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp4-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { 
tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, conc-start: 4, conc-end: 8 } - -70b-fp4-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview - runner: mi355x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200-trt - precision: fp8 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 
8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 16, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h100-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h100 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200-trt - precision: fp8 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, 
conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi300x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 
32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi325x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi355x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp4-b200-sgl: - image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - runner: b200 - precision: fp4 - 
framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - -dsr1-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - runner: b200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - # If TP=4, - # If CONC > 32, then EP=4 - # If CONC >= 256, DP_ATTN=true - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - # If TP=8, - # If CONC > 8, then EP=8 - # If CONC >= 256, DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 8 } - - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - - isl: 1024 - osl: 8192 - bmk-space: - # If TP=4, - # If CONC > 32, then EP=4 - # If CONC >= 256, DP_ATTN=true - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - # If TP=8, - # If CONC > 16, then EP=8 - # If CONC >= 256, DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 16 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - - isl: 8192 - osl: 1024 - bmk-space: - # If TP=4, - # If CONC > 32, then EP=4 and DP_ATTN=true - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } - # If TP=8, - # If CONC > 32, then EP=8 and DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 32 } - - { 
tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } - -dsr1-fp4-mi355x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 - model: amd/DeepSeek-R1-0528-MXFP4-Preview - runner: mi355x - precision: fp4 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-b200-sgl: - image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 - model: deepseek-ai/DeepSeek-R1-0528 - runner: b200 - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: deepseek-ai/DeepSeek-R1-0528 - runner: b200-trt - precision: fp8 - framework: trt - seq-len-configs: - # For all sequence lengths, EP=TP - - isl: 1024 - osl: 1024 - bmk-space: - # If CONC > 32, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - # If CONC > 64, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - # If CONC > 64, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-h200-sgl: - image: lmsysorg/sglang:v0.5.2rc2-cu126 - model: deepseek-ai/DeepSeek-R1-0528 - runner: h200 - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, ep: 
8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: deepseek-ai/DeepSeek-R1-0528 - runner: h200-trt - precision: fp8 - framework: trt - # For all sequence lengths, EP=TP - seq-len-configs: - - isl: 1024 - osl: 1024 - # If CONC > 64, then DP_ATTN=true - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - # If CONC > 64, then DP_ATTN=true - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - # If CONC > 32, then DP_ATTN=true - bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - -dsr1-fp8-mi300x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 - model: deepseek-ai/DeepSeek-R1-0528 - runner: mi300x - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-mi325x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 - model: deepseek-ai/DeepSeek-R1-0528 - runner: mi325x - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -dsr1-fp8-mi355x-sgl: - image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 - model: deepseek-ai/DeepSeek-R1-0528 - runner: mi355x - precision: fp8 - framework: sglang - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 
} - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - -gptoss-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 - model: openai/gpt-oss-120b - runner: b200-nvs - precision: fp4 - framework: trt - # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - -gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: openai/gpt-oss-120b - runner: b200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 
4, conc-end: 64 } - -gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.10.2 - model: openai/gpt-oss-120b - runner: h100 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - -gptoss-fp4-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev - model: openai/gpt-oss-120b - runner: h200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - -gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: openai/gpt-oss-120b - runner: h200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, 
ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - -gptoss-fp4-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: openai/gpt-oss-120b - runner: mi300x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } - -gptoss-fp4-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: openai/gpt-oss-120b - runner: mi325x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } 
- - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 8 } - - { tp: 4, conc-start: 4, conc-end: 8 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -gptoss-fp4-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: openai/gpt-oss-120b - runner: mi355x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - bmk-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 04b0b9b86..2d00fa924 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -44,53 +44,53 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ 
matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + # benchmark-70b: + # needs: get-70b-configs + # uses: ./.github/workflows/benchmark-tmpl.yml + # name: 70b 1k1k + # strategy: + # fail-fast: false + # matrix: + # config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + # secrets: inherit + # with: + # exp-name: "70b_1k1k" + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # runner: ${{ matrix.config.runner }} + # image: ${{ matrix.config.image }} + # model: ${{ matrix.config.model }} + # framework: ${{ matrix.config.framework }} + # precision: ${{ matrix.config.precision }} + # tp: ${{ matrix.config.tp }} + # ep: ${{ matrix.config.ep || 1 }} + # dp-attn: ${{ matrix.config.dp-attn || false }} + # conc: ${{ matrix.config.conc }} - benchmark-dsr1: - needs: get-dsr1-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} + # benchmark-dsr1: + # needs: get-dsr1-configs + # uses: ./.github/workflows/benchmark-tmpl.yml + # name: dsr1 1k1k + # strategy: + # fail-fast: false + # matrix: + # config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + # secrets: inherit + # with: + # exp-name: "dsr1_1k1k" + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # runner: ${{ matrix.config.runner }} + # image: ${{ matrix.config.image }} + # model: 
${{ matrix.config.model }} + # framework: ${{ matrix.config.framework }} + # precision: ${{ matrix.config.precision }} + # tp: ${{ matrix.config.tp }} + # ep: ${{ matrix.config.ep || 1 }} + # dp-attn: ${{ matrix.config.dp-attn || false }} + # conc: ${{ matrix.config.conc }} benchmark-gptoss: needs: get-gptoss-configs @@ -116,21 +116,21 @@ jobs: dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "70b_1k1k" + # collect-70b-results: + # needs: benchmark-70b + # if: ${{ always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: "70b_1k1k" - collect-dsr1-results: - needs: benchmark-dsr1 - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" + # collect-dsr1-results: + # needs: benchmark-dsr1 + # if: ${{ always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: "dsr1_1k1k" collect-gptoss-results: needs: benchmark-gptoss From f7d83402720d991a283bca6edc14feb55f8272fa Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 11:33:05 -0500 Subject: [PATCH 029/149] updating the benchmark files with logic --- .github/configs/amd-master.yaml | 32 +++---- .github/configs/nvidia-master.yaml | 114 ++++++++++++------------ benchmarks/gptoss_fp4_h200_trt_slurm.sh | 7 +- 3 files changed, 77 insertions(+), 76 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2465ee5b6..a501ead63 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -205,24 +205,24 @@ gptoss-fp4-mi300x-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } 
- - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi325x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 @@ -234,10 +234,10 @@ gptoss-fp4-mi325x-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9ac3fbcf3..5c006dc91 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -207,18 +207,18 @@ dsr1-fp4-b200-sgl: - isl: 1024 osl: 1024 bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - { 
tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -279,15 +279,15 @@ dsr1-fp8-b200-sgl: - isl: 1024 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -324,15 +324,15 @@ dsr1-fp8-h200-sgl: - isl: 1024 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 @@ -370,24 +370,24 @@ gptoss-fp4-b200-trt: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 
} + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } gptoss-fp4-b200-vllm: image: vllm/vllm-openai:v0.10.2 @@ -399,24 +399,24 @@ gptoss-fp4-b200-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - 
{ tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.10.2 @@ -428,21 +428,21 @@ gptoss-fp4-h100-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev @@ -483,21 +483,21 @@ gptoss-fp4-h200-vllm: - isl: 1024 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 16 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 
16 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 bmk-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index c381f7c64..969d65310 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -13,6 +13,8 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# DP_ATTENTION +# EP_SIZE echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -30,7 +32,7 @@ cat > gptoss-config.yml << EOF cuda_graph_config: enable_padding: true max_batch_size: $CONC -enable_attention_dp: false +enable_attention_dp: $DP_ATTENTION kv_cache_config: dtype: auto enable_block_reuse: false @@ -42,9 +44,8 @@ print_iter_log: true stream_interval: 20 EOF - #mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens $MAX_MODEL_LEN --num_postprocess_workers 2 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 --backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$TP --trust_remote_code --gpus_per_node 8 --host 0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_size $CONC --max_num_tokens 20000 --backend pytorch --extra_llm_api_options gptoss-config.yml --ep_size=$EP_SIZE --trust_remote_code --gpus_per_node 8 --host 
0.0.0.0 --port $PORT --tp_size=$TP --pp_size=1 > $SERVER_LOG 2>&1 & set +x From 869572a2b5e4ccadcefde53558bf8bd34afc9b2f Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 12:30:09 -0500 Subject: [PATCH 030/149] adding pytests --- .github/workflows/1k1k-sweep.yml | 2 +- .../get_full_sweep_configs.cpython-313.pyc | Bin 0 -> 5046 bytes ...sweep_configs.cpython-313-pytest-8.4.2.pyc | Bin 0 -> 55816 bytes .../get_full_sweep_configs.py} | 0 .../test_get_full_sweep_configs.py | 842 ++++++++++++++++++ 5 files changed, 843 insertions(+), 1 deletion(-) create mode 100644 utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc create mode 100644 utils/matrix-logic/__pycache__/test_get_full_sweep_configs.cpython-313-pytest-8.4.2.pyc rename utils/{get_configs.py => matrix-logic/get_full_sweep_configs.py} (100%) create mode 100644 utils/matrix-logic/test_get_full_sweep_configs.py diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 2d00fa924..ee1c8ddd2 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -15,7 +15,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: diff --git a/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc b/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b29a85d117b2da207801b8a1143e67830ad9f45b GIT binary patch literal 5046 
zcmb7ITWs6b89ovz>wcH4i)DwFmB_LqyQ$qaPMdaVF3!zyD~4%Or4ST}c2rB0LP|}n z4PEvyK-~t^K|93ROZ@1f1DeM^^ua)}p%{jhno@{igTO%d5@5ibC0McbVfzmyN=_OT z9YgT^|2g0J@Bh(p9$Kv?1kYzb{qM!2E`+|pjQWsm;$Z@a-y#~(niGg`jT&B~@LFE0 zj08{6TAH9qS~o%PB;6N8yzYD-t>1SRp|k;^^SUs4t}T#4i<(7E%y1v`r()Xs2vqR| zFfpw%+((o`V+&gIgiP*P+Oveed*Qnm=nO1&9t%)dz(ZM&ZB2F-650;gA;RDZt)0q&xtttVktsqUzH{hX#XN9$-KZAw_X2$lN(w@nC?giRXk^5ES){A%ndLm z$XsZ9=6XmO>`1ZpKn!&dwjPKb9fZ9Hg6bd~JrKhkgp)>R?=g19!8oUNFG4-Et1Fn7z^~s6jY6yaX&RqBdTOZi_cgks?aAr(2)+B=%69`jGJ>&XkLqJfN^!AjWhIb zSam&B^G;P3=V3gZJ*^Dw`o1@e2R!)`c;aQePn8$#$m{Qc80{c@JrJ=D!ruci)e=liEWwM{S@DP+yFDc`1kE=XW$)(nw7D&^2}%%0^|(P@d+arVgeW7 zXk-FRh#tdj$^;cY2>iIh8xxT4Cm$xHT16p66aGEL`S};IGoCDyFP58 z8GqMKW%p^2@aIQn^O_=1)~ zJD|2GG^?Ha4x#dMoATf^eF8>u)ggy^ed-f0J5Pm7N|^-TgWX|0Ff(l)Iu)L3Nejr&ay&=&z10G)fSJp zP;q5h){LL7#^*VfD)LmJ$g@wAL5-d}Fn5w#DDt2z79(UGm*y7**}$`xDmk9b%H%~h zU#gnoae=)Q&$FCR-9f_|7|Bw=E|xDoE_WYQ)$XQ1l0I{>n9HyNk8d$Nz8GK&3&kv( zkC%9MA$O&^a}IM{prP9~UUV^{Feu42G21Qe4PY+57l+wHl{^r!bG*zGC0m@ez_VwR4L_i8JwY+s>a*9hu7sLXKOcqEDY+ECQe(1!om4~l%@G{)u7NUmzA|_sjAPG;^}gktNx^; zpj?XXrlQ>)z=3V83D~Bn?HJKj6f^SpHpYALryqat^^31Vz(*#r@Ys-S zfGRJgd4ZK}2l&NGf#u4tVV0LI>1;LytIC{A<{?3{1zb!)v8RM`F(;Ea$ujwNF~`Y7 zjw{P1MH1cwjnjq-vISH@%cfe*NES;hCzH$RLS8ls=>;~GFQ&7y3DU(ubfBAL5*L_E z2rNiEgDdwi&lh>QPbl(bXvDrsDGLQBJC8u~pDLD*;3{iDbh$EH5M+}QX5n(Kd{MS2 zC6Q{imP{;y(7tpDY{-IoXqXgk7+F{1u><7W0@REyTPc);*r42}_`}O2_NW^{*`Chl zQ*E!nl(cN#=Ax`w=?b)2PIO(_#FDHFiRxcZgowxkI?lPSB^noHp= zEjR=$e@Vvmno{ajCV*@uTd6(B8PuMv)LshQ=R>#-p!ia__;`H9mUWPRep$dY^+S(* zY?6T*5hmemg!a#6CXaCoEIw=4xdUe>E9G2Xm~6Qj&leYA9pau^s6g0+%Pa)F?cqde zStbi`s_?IYN_@BpQ}9>eRdm3pHns*JqA5 zW==LH=fsIqqIJH9Aitt!dP}dUAwi94n;P$3t3z@)AGgDXEUue8KCNuU%fPHhg2XqmKjo1J@U?Ew1Jo{iC(n zO}B4NyLSARLEJIZaL<5q);BehDI z91KhF4v*n?C?X9E;WrqPLevAdDd1YsZ3U61|Jut^II?nhHFE8QnQ<4Z6<1|K*>mjxdG4+9^@AABldBnNkhRO%!2 z5O~DTE3+@b4@)J_zRaHi7GGtBm*E5EXf$6Vv+|n0Ms_&1tO3d5hxdV=bZDI}|iKIm8&$50jS=Ntj(XyUltCbxrM1mA-3ZNfI z*`nLVNt&2W(ne0>6Q*%nrA@b@yQj6Y?WvP=&)V5`ojdp5xpU3qaanME{^g&aXc@Cuen=7R 
zva5k>2doy$ix$};Tko?dR`$2(UR$y#V7b?>I0F2RQ*p98#Y!>zD=8BDTcVV(zb?gv zzeV?z-s@J}tc>kG&%I?z*}Y!H8?Zd;JQC0ztpAnm^_ER)RH;}bkHe6sa*;evLmuBE zd5R5rs${9&qEy#fS}k%(y+w9yD&nu9_*q&h(%d4ghNXFsRwmMFS(+DV@{e(P%uJ98bgs(cirpmnn6 zZY3HSj|L)v@#x91gc4Cs2ZmxwbZ|U@&~QQt3?|~kv5~-VY&4qenL(MXG=6d{Jcg$x zsm?%lpf{9^D#_j_Ct{yE|P7sU3SP$xfuT>xdi`QJvO;icJokgk?cXdST2*ja=Dsjr8L~B z=y9&MqM0fuyN*Q1CzLpv=~QGiHWV0m_~71yyS27Sjw=&`xF7AgUcP1h@$TWVjorul z*01j#SH`dR#ZL_?J+VYv{8Vfx7Qx5IlLYR8GS0DJaOF%EbImr zcB{Rr6r&CfN-0GBnDw!y4D(M_yIWa}JM_ThC;+-zvXnH;S09|O-a79O%-1#Hhwg6- z%r`W@)f{-m_PQ@!d(S*VMX%e^wR^8PY_0yci;F8NuT<)})O)rfy640_UwNSHZBNZA z*A-ix+j+%O=Wr_YVma94Mgqg8bqFkiO)j|sh}sOnJiVdp6(~jsYQJ z@xk$M-}=5y>+k5>5>^H`OtwT$4)rdIIa|B$dpPUl48P^g|j zHGu{KEdW5hmQuoc*H5+@d==Gv>?cG=OzOhNv zH{JmHrh@vW!mMuyMOfd^LrnXI#yX933IAxcdwT3FOye9Qy~ojO!}zAYt5kcis!@u9%bSa)Q6JPwK&9UV&cOjZmfl?^##UMa&{O*W~6 zcrVDC9F0EFePl!T`b~XXY4~DhRL+Xr@p%0oL}c7pf2q+7& zN)6qv1$gMP1%o~4V7HX@NV#6DzMP(Enf`0wUmB(_4QpQ-=AWkYqe^PvWCVbBWg{Dm zn@k6z^))43y9a~uob|QJH!&~|vG6NHZ?>h5eI#8QzEW(b5h!5<+FaDj6;0@6AN8_N z>t*Uy?V_2A_3|_VFItXTyD>=Pwqy8eeuZqZRht%zEOGlWb$Yc}ju7d}yIL(r+qK+a z9g66m9zRCt&90Dbllp{qON(Wlg=i98Lr5Ji3Rs#g0sI@N+hJ?A3|oV?$@T-$(Xpr! 
z7*7NSwdwSib|1Q58}c-F_GGP34Fv63dtxja&)QE%PL5{nLlY;*vNlllpaZii&3c&b zvJ&NwgO+AX)!rG3j7LO-X5OrIIO)JE1?b}Bw30tsn>VQrWf!u9@jrPdz_XUOD(lmg z9nao(sl+q&#B~4plIF7;E=lgsm3+G7%-Zu(%R4rUEAXSzvZ-e>r41>2!#fVdFO)Q2 zXA|!WHFw&+>DpaVJc!D~rM8si+CxZs(efx`?UW+fs@UWr#V*^x06B;O17imk3v3;j zC@^(klfcpud*mY4NGX+FfNmKqk>ZivfMv2r@s5-Qz1i}Iz%U#Fd$Vr>)DDC3@(3O_ z*+R*I1}w75)=<$Vx`!+72-%LP zVV=Sv=Osc8m79=yiw(FVo};muai;;FoW~@sCap~3DGjUkTZh(Kz#hu3WAujXIzn?# z-qk8tSnA9nP>N$jX4rLv$VlGRYB@&Zhg}*zYtM{;uoy}c>B+myN= zqj`W`hSD?@sBKTUA#(!NG}%YtxsL20yq`_j(sx7+#sXy;Iw+(KfSxL58-d^A+d*1r0R zB~ieNrJSu;I*-0{BP*6=*EagdJUD=F;ZT8(^tkm?OnCED`;gbN@2ye_Jb*Vk=Lms zty=DudqUOfNPF*6w<^f3dnY$C0rl}nGB&8H0w$CQ$*;L$X~Pp6o(MowIzg)P07U*H zP^tG^FXeip9#Ropmy)NGJrQN()b*0&pb{G!hl2ciS+}ZK?Pgl}>#pu@RpFY<+NoB` z!SuAp3d5!X}rEEl7yaqJkYUv?DWQ9()ElQ2<%`<>x0SFl(He1T*ngng2|xk zSF*RW> zvR``|0t=sphXi);ci0?EW^HJR8N2czYEg~=P_6w0HeW9@e8E=AI(B-iLgQ}6nk|aO z@wiI$3j$CBs87y1qVaL%^h^<}O*w|_ITdZ<6>U=5=!rC9l}-Zf1R4ntW#IZZI}|d5 zOjT!mmiqE&Q|%$749R&2K1_KOiCQr3_M7)KrK+vWT^P`^SI z2YpPjlPyk0$DxT-6q3R!hX_0XkagcRb~=0<#u!izC$rAc#9(AJnROwR8D2z`Y;8Ds zA~7*K6jq>543CT^jz>nrOyg7%PDaRhV@%1G?Pll#biGGW5=zz=hLQz#9O3BGW22Ec zLuBQb=z$esJYkeoHkdd$7Ez*E7xZ95_@1buWbO2ES;uH1GK4NeV%7nbS8Ock&)SpG z(P3ua!8HHb(vyk!6VcO9aSfixItL?TK!2h5Vk)fxdSEGiA;*P&9^tRH$#|#F7|FS_ z(O71SF>;3jnPyJ?>y5W_3U1CyctJAz7*d|dE49H`sy|5rd#0*e0J!eX&pL(?3Qd1U z!#LYT1jMNL>M_swIOgR`ch^pLQYd<*#1VA;#NsF^{Yjan(*IljDd$g|mh$@P&abv-yj>}&>!P=7PRjYE zrLNCTinQDqPrfL1O?Mh|7-AyFSKEy_wAclyD`ob?OWw*en?LhRN@|{tFp%~(&q?Yp zBQ;MwBVs&oL2CYNQlzDGVm$ex)O=>MF^3jIQLQ@EJRK2vL;%TXCV%79J@dZ$l)G&@ zn)bELxz*pSyY0*t9y_lG#ciZbM@1~16BEf9ciSzkx+&%EI@^)kdq%$+=T zUJr`fNITmhV(FZiNY1#s=6$!Y>V}lNeR?A8YoBwgzgc(tnXNo_UJr`fNSmGzv2;#M zBxl_1^S;`YyJh;G7k19MTljC*-Ew9V51!Y9;x^Kz?-8+dPD~_c+%31V=9-i{Fum!8 z9dqsg|INAsXEyTSc|9m@BW)U=q-IX%#6)t&9hmpkrQEI4p%-?|xm)>f*4>KX!D8q2 zpty~+>5zz}b7CSn<8DQ3Ojp0Kose%M1Nj)ZUB^H*2%I{LZ_*K!hZ(BftjaFByWXzB zc)rqTsj8pzwFAR4zRoG>lE40okAMF0v$cQJ_?5=Fmi}~0f2L(q#=m*0blzJ(J)HJ} 
zh^fDf)IRmh^stEYzy(SbY3ZC8PrfL%8%iOEAtnlerWh;71QcCL2F@4LBXwlW`$FilDUdr_C^ip`MGQ%^> za|^?PWpfHuwCq(C%%w1ztq7Hxh$!@TG&ff;6H!z`!7O26J`*bTSswNzi#Ax2MLx>` zU|8ZGWFH}4tmqJ|M`_6(FN=HRDj__Z(prcqpY^Ak<5+b_0*?7}l+St@^LEme&owu_ zpM@G(`-+14S(HLQ*T}U(`3yz0sq$I>Li0W`Q$C|l#(pL@+-_$YfVaAxM=EJQM6xLE z8MXu)Cj-R)b78?rEW2Z4qfw(xVX{Zri|V1CQQ1) z0G7+vomh(Y7PdIw$(GWzgr()4LQ62^mkB&Z;3EJ7e6`F>TFKy0i2!7axsiBM8Kk5k z0#O2M0A}255v#;0!A^kH{Un7>5r9C}qKpwx2qX!N6PN%9Rw}r&u+9x*ahvtX96&LsBgydD%R5z?j~5i#PM^q5En z-!$*7NlC5KThiXvIZ6G^O082*^VoSkC~hNddW(pqb7CSHncltXu=utnVWwyK>vd)K zb=b|VO@D+8AMw6uS#Jpy-D3^e?y;Wq=CEYehrEI%b1cG=v3)Fs+n$EJnA~7h`?y4w zS{iF)&Z;H!iSzft{E>yHK24ZTo?3l^)mnx%TgWV}Mw`*aZ29!krYz9H(jda6h6&iT zl+FSzn^Ukqw2OmTZ?v2)(R?HtUpOuFKRU~w%8kTqgTt1{;`2AuDI!aB^U$kd$hMKsIL~3MqgFH zI&>u%MQpL&7G-)HXd$$^xy`U%${Z&|DZEvs;Th(+g*D}}IZL&+5_XqhOksEFQQ6|M z72uO#>9jgIKE++YslP3VI{25hOgG2On%kJEVVL$e&*uI%X0nDc!^{y+uy%3-V^_yd zM8*S5`xM~XC)zs-ML;Y*5}@6INwBaA?JylZ-81Qd!r*u;9xa%E&DqmRb8kQXUnBNy zr?m~MLfYAi6=OTpsn}zzPqYsenaMh|sZ9GM*-C3nn%GWRUUUR3hdTLMbEIgCjkfye zEc8IIh-+Ky&{rgb*lURmob(!5ul7kQLzpBPOCazp7xkBHABo1HK~Z*QcP+vq(gw>g zZ#rUKdDHc5=X#u-J)(lIq26Q=;AfY?Ii@NPf5-p2Xa0*fsU zu+IrQbySV|4kTvUO6ocg5~+FKVbl4{gsbWXn??H2s}&RV+1}< zz>M$z9o$F~{mITy_>WDMVtZss)k?KCz6Y&;~OP;yo}{DITMEZtPYW(qFJQm!tEJ`l>J^HMXH?b6WHE@HchYFsPAih2pzu6@a^ zYTCy4<+iFhLJmmVoXMhIE^Wgmptw^m2|4e?l5_|tdkC`Tkn?cSA-oNfycOqcZxb@g z+fv^GWYfe%=nU1PFyCOT4W&pZk#~V97Nx-6{=!Oq4@?33JRuL4_F{WcRpQ@5 z-fa*W85XLTFAS|=x=024%_Y45TVF02%9yvHV7`~lX^;gAscWrVC#0^W24g+Fx6zCF z7y3;sA2comEI&!YvE6FybB8{y8mQ%t(P}Y9QK2?QA@$YAES9ir%!1ixW40n^%nC{G zhjh#~7#4M=OH7S3Bwlc+kTEM-bVXxUl){*8SQhJqsAAciLM(C1H8mC33TQpBSsk?{ z#6M%yHnLHBV8x?$6&tlpM}NUbt=ydWF{faOKfyLzP%n(LK)f>K3RN;%_A`McV2eXu zEj9#+w4ra!DO=0Y&uuJ{TaW&aP~}FOwguNv>b=r7UGO1k)arBH@KLn_Dp;k}rTx{ZAn63-FkOzj;+6ZYhwfM*Mbs!{K1Hm~p@`Mmb3ddMvGb-= zMA4!eb{C}66s15BJD0`k4yv8@gz2xPPVm+-f#61b@+I z1_Nii=EFTlf|9?;DY?*kJBP6X4UqmjClp*edABZF)Vvg#`(p~g`WVLFgQF8e(V@Uo zC!%qZLMCAX4$FG(PNxu6`5$1?U@n!IzFav&QH*FVV)nbiP7- 
zm~xXVryxw>x9y1C8*4|^;dmlEG^W67i7IUyp%6Le zP)H{QLF6A(m_V>dw}4ll0`qpBhCo0~+XBme8#jlKq7>QDZ<#b2*WbZy;_t{-(c$U~tsW*DItH!&P{to!+9=yhX7#qyvSxi|)$W)5XLrA{@wF#k-SK;o8GlNuPp#Sw zkzC5xk@0q>q&4R%7)X29%t`7mBdtk!VU4Y)AbdeuLoz@$Eu9nN$rn+~*I~>eq9`nC zMD6D)L@p6P^7pE{$UM4s+W*3il)Lln9tP6B&N;XGn{{`df$b{A&g;Ppo7B!;wbGt<@5kT^# z>NUJgs$RGgX5(jD7|6FBhZQzil9TPY8iY0YS(qj3CgwcMjKr%`?(XSRUs^?fR;7L2 z_~sPSnu@27LOA@si>D!kP7KRT&>*|ELG#4KUl3X|aaV;Z?#dHU=Lop5 zvGiHd@=BD# z@~V8<<&`c*s!+w8rSPNWyJ-f$RSgG}(`bJ&M_q@v%Nq-wx7_z6Yu7F#YhSR~a>1n3^rDHsoTv$`)k&XH=W~K7fINxv^bkn6}%38UG!>-#Pu@xuKc;uRJ+( z{}-S9O6OE*%74eDs`{B{G7WdW{LHzQ>EV|S&ODQDxHD6=eX3-BRr6Wj^ud(WF?}#i zQ-=DRl{%(Qix>}F$Vp4u1y_l$N{ zH%HJeDA*0tJ)=FExnNVLdp<$;$fE0JZFKN6y5~2gdo`1OM)w9M;KTi-PWB3>c-Kp@ z9~`5;n?`-sx{><_UW@`NuK}|;ZTSX;bQ-68lj8pgAh`&Isb-5OopA71&UVuKFI+C= z=ldDe{NDgpMqlRLb<_P11Z&Io2BkqFD%vH8 z%vGeCy0umv*!RK&R&_iM7xg(Vkzsyn*yCjnnc6*ZclRMX8E48qxoR1cL5E?FmqRc~ zPFYHvQXQ!XBjN)0cxl?%5nVeguWVNhIdPS=hPxpLxO!<*B4BEGj9oT2osknQx*~!m zN`aB9HME53x9A45%jOhT1ah6a$E!rHmm5MQs!qqX0-cV~BxyJUiEtzII8Kni=U~ph zil!pDiD0}`E=l}t$jwZnhqaa%0uN<)V{?62HQb+A<>sS*spFENjCp$qo%^ym#XdDT zpiTuIxkYXj#=RvgXb&;110K1ZIay$lc?kg0Co(C zr+&V!gMRg0^Q&6tSGB<&EzmyS-1b&m`zw21-;-|KkCP2L>v8Htwf~AuYOln}290Gn z*`U$|UkmM)njkrAc+;QpA6~gw@JAmsuL$$r>NAn&Pfz>P-atwsp&x-X$sg6qxB@(*I&}xtDn_FVFE_ z{$brkBU)6OZkd8!Z$a~(7Nx*=7*}`Zxy6*XECvZ?LF}T*_g)SI4t^QxGT)VD=Di&F z#J!heuY!NFpU6JnT00aQX6IEw#FXodE7xh4y4%)lawM-?Tc6N3+2(9gAO@clVAsen z-LORgMrDZ#*}_;LE3rL0tfC(z%w(33RW4$8ZFE=Nkg!TsncCTg1P6Ce#in%Ju_)xG zZh)QC1HsixltOZ(BxDwt8rgKH{4T=TJI!Z3(G+1c1)U?7G}(8sgXTbbRle=u;I}$o zd+&4dv73C8_aNI8W25F<+`MF&B|wB~Q#XQ<}dg zsZbHV%dmDaBS;^or{&4&X$h+@QuqR;ju9SMd9d%s;zUs2>>6{@W0vK727 zmAVZ8v8r!O!Dn3M^S^?9Nc=Y|HB6lnF&?;(lLjwyS`OTpuFT7fDC)R9ywO8+eJF-E z+akxUuuO-yahqr%@@+WxNMQFQoR5-H>|rz`7hju=Yh|8pe-|KWHt1F@7K$gHi0%WVhIF%M| zLkG#IN{JnZchDp%f;Jw>j%aU- zxOU0bUL0Wfpg8<6U|H_yLw(LC|892_6lU{1a1_y=`(PX@9JirES4_iP5iR@LwXEuMkA3>F zm$s$4cVs*}FS*MHtM> zexb?9nBr_pIvH~j&k^FJOLV-Q{^=bL#G}P$-o;NpG^m3_4LDsDI&HZ$N0lUL#m!y) 
z8P@S-T2B41#xOEeB@I2h45*s-#v}H{QFY)1N7co1jM!CVcB^i;49Lt5^MsXaEY6O* z?U{6IA#&2K&?`;WsY%-cl(vm{S(>D_1&$4vi45}aI#{A$3za47cuK)>mfR~AA&~N~ z1pX(1Ep)$xUPVXd8Al#6D?2FbPJsOLK`R(yt1lU5=a056AXgltjhV&>m7hm_KxKc& zxw165zr5tBIJ0&-dEV1@_Q`p-=X3j?-=C`LJ?~ymDBNzc)_rO4?AlCI&myje)v>Vg z_jZ11=LOFiF&NgSJZs*mMvWIdZP(wavXrgG-Rnv0XtaF3a*xyYMqlZk8r!#=u02)7 z->Px#=~!f#+*|N4S&oW=>e?Q(h77~RWZ2MzmUpScM$lVmXz@;TnGPEo;OsI#2vk2x zf(|xTY598{AdeM1K*}bWb2Afak0C;oIf5hP**UE^O}=NcgnY$wf=&Ykt}<-Ml(k0S z7XsM|86A^#Ct(c1h7_IeYOG(mOm*_n!-j~m6$z#TXq;}?30wonA0Jh0d|)EN8N4`B zn=eJVDT4{o&&&5LjgrrzRz6Bf)BgV4Q39O)+`&&Dob|6cxBI-i=RJ*+ov97?XFP|* zFd49>Jcn||$$<5O=a4#3>i5`eZ}gV#sj_{`=Gs$H{H-e2p4QxfLURq9Ps-Uqks){F zPGDsx5Wbks$ByHqWffH&hE%)mUKslo8Tk#on5Awy5V1-G;2CJ7KQ$X>Xeg+oy? zO2~0LOTEUfBg7u%T{x>&%roS7NgWXMRqQg%7=9W5^f_+Czr=jtSLcJ8$vcQn8qs}d zVr(=v7{R&2noJzt-i zuTRU@H|gfsxPgr#E!J1aK9>7X?_}d%@xHtw?HCGrdp3Xuf=+`FM4xkhjN+VYD$bi| zvc>8)JZs*73HdxCMIE5}#|T_=?}+StiR*{FaKgv%6jyczu7S{QfD&rSeuo%)qm$ zPV=Kn&s^Z*?#p=Y7bEhJb}qpfuJ7TX(JTzee3<4*8#v^>f;_vvvg zFzk^G<7+X)9>#;gum=+Zji(%Ltng8UF}TDis>JxLG>oFkB}P%DI*NP?Ck^$?64i_W z+!X|WobTr2MtKv>qWlek?+_qar9#sjSp8NCw-IP3&_SS+zzl(V2#EDXp8Wz}N7NnS zj|z0sD3LZ3@-MHvQ<7?4U-rjLI%@?)Q(eOVH^}7fG3&sB$4gd3Ajbp#!0-7nMgln`2WOSTU@myrq z3ak6(mPO{g{G`xH{x(Nui4pCNw^UP_R;(hHHKievzhYCG#Zdj;jORW$z@ZmcO+ z+#oFnFSP_MuL8$ZWuWC%OVIKvm6lhlW@-imKYsFw?&Mfx5PNS6rQ!^|v3a$T(3UB$ zTPRd00{b)s?h?;dN%!K&%g@=-C@ac^**kLu?-Ddv~C^}b#jC?n12bJecNFt zdoL&1dwB>+y^J0Q-a9?sm+|ZuNO}NBy8ohQze(YSFQ^{8q1z0&2WwzCb#1jgPJ1okI@Hr^(d3>yEU7$;UJ;d;SJOMAbES7W4#sVzYXml?%Mj^Xz+Mbsq~{6&tqZ$d;|mFv5TI^t}7K7n5trv#>ja9a5Lt98UIIP>eggl@?h>G}!}SvXLLmYX)A zU9BwxfWiTw72tr?54gS=V2`!Y;rr?KCWnvLMKuiKzqn8vyRL1=pBvRhD5WE=U}@IF zx^`$JdUph(@7Ox@++;n&{p1A1iUDkm4@7Z0Fi7`v^$ubxOW}>kU3uQF0=O<0OZdeW zAKXoO!F~sCTUzLL`M2ECf^VjLDmH#15Is3IewqlLCaCC{bg{HTB?^UTUV}#K2-34C zVu55hi{UeLV~W7@0H%ssF1Tjzr%C0SbMTxCt&JIaiDZ2R6+#?W;hS=D|ElZWbY}by zENy3<%N+7E@xY1RVsi*pG=>+pScu=G__e$65qYa8ZMH7Pd~E|`dF$qD;`-U83D4p= zdw@)~Q&&@wMYdBHLyYKu(3*HIB(eP+Us|o!UeI>x8g}e*?a`+Vc2#vttBy<@cT4Ic 
zE#`ZLbnltkyA|ZEO6-<_T~(bfhFrv7nlN*g>s7ldm)syXhFt32n)m88auw{Vrbkxk z+X{nCldX*YvvEI)gJePb!S_tS0e5`xbb#%L=uy6nw#k>atfjPd1oRCf%mqy~rLBm+ z`CBS63;;PjRdJd6LJ1J)CQwhHiogbdm<5Q>A1VwyvJQIdWWMPZ+o6c9*trVbd_wao z)!d9(&2;lLjW@QwRRLe-@|}&W!HDRnTS2m1W2Q~8obcplqH&un!sC|TBC`ASjp@b% z3pu8nwnYTzR94i?Y|hm5y}bEsWV-XEl9|ovn!ZeV|CA$JyDKH_O4aU4dw0!A>Tg!s zH8mk(Ja8c=Ev?6J1I6=lBZ^Ai#SX^Wes(y@8{V?L4U6n^ycd*i5gG?ovbw{OD4oY- z30+nIN8)AYliB$$N4Yv@3B7U}=U;}#)!ZWjD==L}v7dDyHX_MjHDYp|C2lM5XEdwo_}_S?i>z-4oa4G40J+6rZI ztK1eUQz^Z71-k}?uVGDRj9rvyg}56n@f#ye25R3v$rg9q;Eu=^?G~g3E!WD9C7y~Z z`VaFLa2G1EQHvkfYya`w(&4) z*l=PP_HEHIzN7G-+(}~b?1MC2JW$YdvFG)>|6fiQ-SDK9s_izq(wddJX}ZvTYatA0 zScWus($Zq&C95_kD=&#G^WLg6N1i`TXFK1`fboFmS)9lWcOT98!CSr-q{Er-7H2Yx z0CJFD4}I?07m#wc;}ip57=OKu%{|8uLoTHJ_m_8n$^C zJdTUq09u`(DCA?;3X0NaXc6ws8G9OTw+L@U?lcNG;{I^A$cPm5OfnN`?9#w-KR6#I z0QW+4kPmItU^~ayVLw=HXlMN}i#G^K*>ZvbM)FN)`hb!`b8A_f5LV`R0t-`F*elTd zZ}7I1q$pF*PVoC|QScn4K1<-^1U^RKlK_k}{Tq`m!flKPvK`cvLE9zAaeXDGMGB?3sk^g*r< zpTK0BIX0?f&&kN_OCY#QvG~kVp@f)yrswi7w%;a~Gv>d-gzc{^fbEA%6L{;f!`cT9 zJLI~;*?ygf%JsSXzBN**{f(1tR0A0P1ujsSP(u)2a}z!+`ssLzRY#2uP?~hZK64?3 zCXBfgjmr)t(4RtsBJW!CoIJf0F=Ba_u@r zC1{0*l`Cp%Q3`F{`2lHbFY~VXYwxM8 z|HaVOT{pe0#Zf>jYHLvnZQb?3XzP(yRX|WGoU7-Af~z&BL!=tg&ZiCtql$p)ezIUu zfkCFrL_cF6OOt*x8ZEn?>_Z4 zr|Jfw!bj&w`jCS=C1o;oA^afEquepkLSeU`%8(eHGHqZIh$M5|NAZn39$aw1;$Kn$ zTQWLs;FXv3Od9kqBTw(69%StFXXuTl2$1%UxtdV=>F(Dlx)GryyjJFj*THjhhMS8~ z7H?ZqG0NiEe}me624H!qJ9o@9FI+THX!q-V>BfCmitXgANh0GN7a8wVkR;y6jCWQS zZoFgW!yS_N>V}v4(@k4mb!M8jWE!?)shJ7!eI@j{r z#;@1BzWeKqUp_d!n|WD1m2TLVsoFnPLjG58R<5_;v9)sg0G;N3ApbOXh;ZmMck#P` zW8L*M9O16TaAW#_$R+~FK@MSWY4uHKi>CLdq_*k(SiH?i>TgzRqj%F25WbL;meymq zF}*)8Goq;D-K!o;<9;g~&hDfC$#x8lzgpuS*d_hdwi3X<_PHOhOMl%~LU6nLffDJz zx=R4f&y~LyimN4@VK|mN8j?17GE>}TRz^4J z3tuVLSXxhF4Obp72}w8>$$sG5q_jCkQ$M>x?qjW5STpNO{Db!Xa3;DI>pD6?ekDDK zDb?_nragEpAfAd@iZ1P$?40{$dU9cDv>ohhSM-+PftGvAkXR>)w*&qfN}1!CD22Cz zBxFTzRY=?EJ$55)x1coMvTE6H5~~=wT6IfWdhEU8q(}Bc#^XNH!7;!HHELw=`&V+T 
zOUn@wPHl8tWq$w2J?zcK)K2ZG+ISMUfZkuO-Ht6Njn=MRu9W_L%jVSkrA|Ep@IChn zq&(O@$y`Embsmni8=Dv>!x851B5<$#z(4@r zYPe%I_+N=WJr*4#uWbCVwL)Asv2B;!%HwemG35yY0|Z71oFwoNz<_c;1rHH8PT*Yv zhXHbAK!o5-2DAqOgA52Fr?}O)(@Ix{Yy71u8-iAXv;rd*CL6L2K;Ot7N^Un0ehGiF z)+ZK$QLHh zwtQ8ds|luRf?wONN{3c8=R3*dTU|XUYij<|MmIVL&`i&E!v?Did%4L}L3xV6(*#Zv zm?ZEFfhP%^BCw=Lh&K%Dh%Zy-uK)xq3}Pahj=4L|U!%}R39!EXG=*4KGg;ARDf%%2 ztZSd8&~pSnPT-RO>?ju16jT4hMMjm(@+POPvjG!`LvNd%C7E9-4Y`#MtY$5J~UNCyw69(*(%e2k*O z$I`yX=G^LU7AZK7kwwoN!WouRWD^0T(>veFx@%w9m2y9l+ID}s^U#|o(w&b`tn-ny z?~ysT`kQq>a%LBgo!5gImQQ37fs5`(u6lvRe4i9aod$w#&48bsVGy8ch8GWvCPrd|xjVZcDCDLX zdr`HiDF$=hb@O+W$k@xd+#y$aN^_@B?4$E=NM8-j^HAYKwWF%=g}5qCyGm+r*y6g5 z2@30TEUp_qfEL#c1`(_2@>Uls%8f#Jw@Pj@2=BlY@yk$`sUXx;c-M@@=k!Rk!Ol3K z+I|@^aZE{!;GpFsaG1MQ7e>Z(Sl_qt{6cn^1SlW9?#fk!%C}G~YwhDZO&s|uHl}xE z)jv$tzea#<3i$?wh6qFf2KcF00gP?tMHV|v3k<~)J4aThq}8d~)oCxz!&QH?(rPmF)e{iDkdv0y zW4M9hdASiqC4A?|H7EMwjf%3ps}|WgvM4QM4sdS%&Ji*tdyllNb{Se$cf+)-p3$;~ z99q`+0iX#F=x(E4pyaTQF^`fuVvNjFB&)~2MjsoJ$^@7g&@{mn{iiJ<8T z2w%uaOY1S*K=HiXh@uis(3HKzJ)%*>CD^j}dlTHPCS};1RY#;QE@9)qbf|=BarG0F zA!#s>0heeG5x`c8dUld%gYhI$Lm6thyvwLvT{frC-Ux=NR3?T=KiyO4rww%pErb4O zJXPr03=(@VOmS?d7@NWOPlbx_M1d8aARS=e-*WUhgT_MAo@=I2H-5j|dQ1@4BSIUc zm9|(+^xb;h=4_#`z|{0Ed7Gondqk9i&Iy~}ROZwpx62*k_`I@Xg}1fP$>g^6*r#)t z)6U6*Z1u!8CJ?)&ZGwx8J{5tt##nrCbYcjy%jhYbIuRRYo8y#dWbg#ij-L(~n6rXu z{bj=U%Y6Xvg?UwWgGMlFuLLQ?==um%dV)ZV0PT4jP{I_95MV4Utx9zk7OUo6ln^Ii zCqOn}23cS3R*5{(-?HXk`d+%{sM==7hORHnuPY+nz9Q>3JmN8hLm5xuuM#p3EB*WBfU6^}#?};RnQH5kCJxY|h z&CZsHB=}C{X&mkUmEMmDz_%cjmB!&MguInzixSCf@mOU1#AxieLbG$$KN1}e4^PlW zpr@iZIvpqqHz`RzkZ71^i-%)Rk54F3h2}1WD5BCsfYcFu 0 + runners = {entry['runner'] for entry in result} + assert 'b200-trt' in runners + assert 'mi355x' in runners + + def test_model_prefix_filtering(self, temp_config_dir, valid_nvidia_config, config_with_optional_fields, monkeypatch): + """Test that model prefix filtering works correctly.""" + combined_config = {**valid_nvidia_config, **config_with_optional_fields} + config_file = 
create_config_file(temp_config_dir, "combined.yaml", combined_config) + + # Filter for 70b only + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # Should only have 70b configs + assert all('70b' in list(combined_config.keys())[0] for entry in result) + assert len(result) == 3 # Only from 70b config + + # Filter for dsr1 only + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'dsr1' + ]) + + result = main() + + # Should only have dsr1 configs + # 3 bmk-space entries: [4,8,16,32] + [64,128] + [256] = 4+2+1 = 7 entries + assert len(result) == 7 + + def test_optional_fields_ep_and_dp_attn(self, temp_config_dir, config_with_optional_fields, monkeypatch): + """Test that optional ep and dp-attn fields are included when present.""" + config_file = create_config_file(temp_config_dir, "config.yaml", config_with_optional_fields) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'dsr1' + ]) + + result = main() + + # Check entries without optional fields + entries_without_ep = [e for e in result if 'ep' not in e] + assert len(entries_without_ep) > 0 + for entry in entries_without_ep: + assert entry['conc'] <= 32 + + # Check entries with ep but without dp-attn + entries_with_ep_no_dp = [e for e in result if 'ep' in e and 'dp-attn' not in e] + assert len(entries_with_ep_no_dp) > 0 + for entry in entries_with_ep_no_dp: + assert entry['ep'] == 4 + assert 64 <= entry['conc'] <= 128 + + # Check entries with both ep and dp-attn + entries_with_both = [e for e in result if 'ep' in e and 'dp-attn' in e] + assert len(entries_with_both) > 0 + for entry in entries_with_both: + assert entry['ep'] == 4 + assert entry['dp-attn'] is True + assert entry['conc'] == 256 + + def test_step_size_default(self, temp_config_dir, 
valid_nvidia_config, monkeypatch): + """Test default step size of 2.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # For tp=2, conc-start=64, conc-end=128 with step=2 + # Should generate: 64, 128 + tp2_entries = [e for e in result if e['tp'] == 2] + tp2_concs = sorted([e['conc'] for e in tp2_entries]) + assert tp2_concs == [64, 128] + + def test_step_size_custom(self, temp_config_dir, valid_nvidia_config, monkeypatch): + """Test custom step size.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b', + '--step-size', '4' + ]) + + result = main() + + # For tp=2, conc-start=64, conc-end=128 with step=4 + # Should generate: 64, 128 (64*4=256 > 128, so stop at 128) + tp2_entries = [e for e in result if e['tp'] == 2] + tp2_concs = sorted([e['conc'] for e in tp2_entries]) + assert tp2_concs == [64, 128] + + def test_conc_range_single_value(self, temp_config_dir, monkeypatch): + """Test when conc-start equals conc-end.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + assert len(result) == 1 + assert result[0]['conc'] == 64 + + def test_different_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): + 
"""Test with different sequence length configurations.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + # Test 1k8k + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k8k', + '--model-prefix', '70b' + ]) + + result = main() + + # Should match 1k8k config + assert all(e['isl'] == 1024 and e['osl'] == 8192 for e in result) + assert len(result) > 0 + + def test_no_matching_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): + """Test when no configs match the requested sequence lengths.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '8k1k', # Not in the config + '--model-prefix', '70b' + ]) + + result = main() + + # Should return empty list + assert result == [] + + def test_no_matching_model_prefix(self, temp_config_dir, valid_nvidia_config, monkeypatch): + """Test when no configs match the model prefix.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'nonexistent' + ]) + + result = main() + + # Should return empty list + assert result == [] + + +class TestErrorHandling: + """Test suite for error handling.""" + + def test_missing_config_file(self, temp_config_dir, monkeypatch): + """Test error when config file doesn't exist.""" + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', '/nonexistent/file.yaml', + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(ValueError, match="does not exist"): + main() + + def test_invalid_yaml(self, temp_config_dir, monkeypatch): + """Test error when YAML is invalid.""" + config_path = temp_config_dir / "invalid.yaml" + with open(config_path, 'w') as f: + 
f.write("invalid: yaml: content: [") + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', str(config_path), + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(yaml.YAMLError): + main() + + def test_non_dict_config(self, temp_config_dir, monkeypatch): + """Test error when config is not a dictionary.""" + config_path = temp_config_dir / "list.yaml" + with open(config_path, 'w') as f: + yaml.dump(["not", "a", "dict"], f) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', str(config_path), + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="must contain a dictionary"): + main() + + def test_duplicate_keys(self, temp_config_dir, monkeypatch): + """Test error when duplicate keys exist across config files.""" + config1 = { + "70b-fp4-b200-trt": { + "image": "image1", + "model": "model1", + "runner": "runner1", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [] + } + } + config2 = { + "70b-fp4-b200-trt": { # Same key + "image": "image2", + "model": "model2", + "runner": "runner2", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [] + } + } + + file1 = create_config_file(temp_config_dir, "config1.yaml", config1) + file2 = create_config_file(temp_config_dir, "config2.yaml", config2) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', file1, file2, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(ValueError, match="Duplicate configuration keys"): + main() + + def test_missing_seq_len_configs(self, temp_config_dir, monkeypatch): + """Test error when seq-len-configs is missing.""" + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + # Missing seq-len-configs + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', 
[ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing 'seq-len-configs'"): + main() + + def test_missing_required_fields(self, temp_config_dir, monkeypatch): + """Test error when required fields are missing.""" + # Missing 'model' field + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + # Missing model + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64, "conc-end": 64} + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing required fields"): + main() + + def test_missing_bmk_space(self, temp_config_dir, monkeypatch): + """Test error when bmk-space is missing.""" + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + # Missing bmk-space + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing 'bmk-space'"): + main() + + def test_missing_bmk_space_fields(self, temp_config_dir, monkeypatch): + """Test error when tp, conc-start, or conc-end is missing.""" + config = { + "70b-fp4-b200-trt": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp4", + "framework": "trt", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 64} # 
Missing conc-end + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + with pytest.raises(AssertionError, match="Missing 'tp', 'conc-start', or 'conc-end'"): + main() + + +class TestEdgeCases: + """Test suite for edge cases.""" + + def test_empty_config(self, temp_config_dir, monkeypatch): + """Test with empty config file.""" + config = {} + config_file = create_config_file(temp_config_dir, "empty.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # Should return empty list + assert result == [] + + def test_large_conc_range(self, temp_config_dir, monkeypatch): + """Test with large concurrency range.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 4, "conc-end": 1024}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + # With step=2: 4, 8, 16, 32, 64, 128, 256, 512, 1024 + concs = sorted([e['conc'] for e in result]) + assert concs == [4, 8, 16, 32, 64, 128, 256, 512, 1024] + + def test_conc_end_not_power_of_step(self, temp_config_dir, monkeypatch): + """Test when conc-end is not a power of step size.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, 
"conc-start": 10, "conc-end": 100}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + # With step=2: 10, 20, 40, 80, 100 (last value is conc-end) + concs = sorted([e['conc'] for e in result]) + assert concs == [10, 20, 40, 80, 100] + assert concs[-1] == 100 # Should always include conc-end + + def test_all_seq_lens_in_stoi(self): + """Test that all defined seq_lens work correctly.""" + assert seq_len_stoi["1k1k"] == (1024, 1024) + assert seq_len_stoi["1k8k"] == (1024, 8192) + assert seq_len_stoi["8k1k"] == (8192, 1024) + + def test_multiple_bmk_space_entries(self, temp_config_dir, monkeypatch): + """Test with multiple bmk-space entries.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 32, "conc-end": 64}, + {"tp": 2, "conc-start": 16, "conc-end": 32}, + {"tp": 4, "conc-start": 8, "conc-end": 16}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test' + ]) + + result = main() + + # Verify all tp values are present + tp_values = sorted(set(e['tp'] for e in result)) + assert tp_values == [1, 2, 4] + + # Verify correct conc ranges for each tp + tp1_concs = sorted([e['conc'] for e in result if e['tp'] == 1]) + tp2_concs = sorted([e['conc'] for e in result if e['tp'] == 2]) + tp4_concs = sorted([e['conc'] for e in result if e['tp'] == 4]) + + assert tp1_concs == [32, 64] + assert tp2_concs == [16, 32] + assert tp4_concs == [8, 16] + + def test_output_format(self, temp_config_dir, 
valid_nvidia_config, monkeypatch, capsys): + """Test that output is valid JSON and matches expected format.""" + config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', '70b' + ]) + + result = main() + + # Capture stdout + captured = capsys.readouterr() + + # Verify it's valid JSON + json_output = json.loads(captured.out.strip()) + + # Verify it matches the result + assert json_output == result + + # Verify each entry has the correct structure + for entry in json_output: + assert isinstance(entry, dict) + assert all(isinstance(k, str) for k in entry.keys()) + assert entry['image'] == valid_nvidia_config['70b-fp4-b200-trt']['image'] + assert entry['model'] == valid_nvidia_config['70b-fp4-b200-trt']['model'] + assert entry['precision'] == valid_nvidia_config['70b-fp4-b200-trt']['precision'] + assert entry['framework'] == valid_nvidia_config['70b-fp4-b200-trt']['framework'] + assert entry['runner'] == valid_nvidia_config['70b-fp4-b200-trt']['runner'] + + +class TestConcurrencyGeneration: + """Test suite specifically for concurrency value generation logic.""" + + def test_conc_progression_step_2(self, temp_config_dir, monkeypatch): + """Test concurrency progression with step size 2.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 1, "conc-end": 16}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test', + '--step-size', '2' + ]) + + result = main() + + # Should multiply by 2 each time: 1, 2, 4, 8, 16 + concs = sorted([e['conc'] for 
e in result]) + assert concs == [1, 2, 4, 8, 16] + + def test_conc_progression_step_3(self, temp_config_dir, monkeypatch): + """Test concurrency progression with step size 3.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 2, "conc-end": 100}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test', + '--step-size', '3' + ]) + + result = main() + + # Should multiply by 3 each time: 2, 6, 18, 54, 100 + concs = sorted([e['conc'] for e in result]) + assert concs == [2, 6, 18, 54, 100] + + def test_conc_exact_end_value(self, temp_config_dir, monkeypatch): + """Test that conc-end is always included even if not reached by progression.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "runner": "test-runner", + "precision": "fp8", + "framework": "vllm", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "bmk-space": [ + {"tp": 1, "conc-start": 5, "conc-end": 50}, + ] + } + ] + } + } + config_file = create_config_file(temp_config_dir, "config.yaml", config) + + monkeypatch.setattr('sys.argv', [ + 'script.py', + '--config-files', config_file, + '--seq-lens', '1k1k', + '--model-prefix', 'test', + '--step-size', '2' + ]) + + result = main() + + concs = sorted([e['conc'] for e in result]) + # 5, 10, 20, 40, 50 (40*2=80 > 50, so we include 50) + assert concs[-1] == 50 + assert 50 in concs From fdb94fab8a835d5603a0629a038fd15da101701c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 12:34:06 -0500 Subject: [PATCH 031/149] adding other isl osl --- .github/workflows/1k1k-sweep.yml | 128 ++++++++++++++------------- 
.github/workflows/1k8k-sweep.yml | 145 +++++++++++++++++++++++++++++++ .github/workflows/8k1k-sweep.yml | 145 +++++++++++++++++++++++++++++++ 3 files changed, 356 insertions(+), 62 deletions(-) create mode 100644 .github/workflows/1k8k-sweep.yml create mode 100644 .github/workflows/8k1k-sweep.yml diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index ee1c8ddd2..768d278f5 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,5 +1,9 @@ name: "1K/1K Sweep" +concurrency: + group: benchmark-lock-1k1k + cancel-in-progress: false + on: pull_request: workflow_dispatch: @@ -28,7 +32,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -41,56 +45,56 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/get_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - # benchmark-70b: - # needs: get-70b-configs - # uses: ./.github/workflows/benchmark-tmpl.yml - # name: 70b 1k1k - # strategy: - # fail-fast: false - # matrix: - # 
config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - # secrets: inherit - # with: - # exp-name: "70b_1k1k" - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # runner: ${{ matrix.config.runner }} - # image: ${{ matrix.config.image }} - # model: ${{ matrix.config.model }} - # framework: ${{ matrix.config.framework }} - # precision: ${{ matrix.config.precision }} - # tp: ${{ matrix.config.tp }} - # ep: ${{ matrix.config.ep || 1 }} - # dp-attn: ${{ matrix.config.dp-attn || false }} - # conc: ${{ matrix.config.conc }} + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} - # benchmark-dsr1: - # needs: get-dsr1-configs - # uses: ./.github/workflows/benchmark-tmpl.yml - # name: dsr1 1k1k - # strategy: - # fail-fast: false - # matrix: - # config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} - # secrets: inherit - # with: - # exp-name: "dsr1_1k1k" - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # runner: ${{ matrix.config.runner }} - # image: ${{ matrix.config.image }} - # model: ${{ matrix.config.model }} - # framework: ${{ matrix.config.framework }} - # precision: ${{ matrix.config.precision }} - # tp: ${{ matrix.config.tp }} - # ep: ${{ matrix.config.ep || 1 }} - # dp-attn: ${{ matrix.config.dp-attn || false }} - # conc: ${{ matrix.config.conc }} + benchmark-dsr1: + needs: get-dsr1-configs + uses: 
./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} benchmark-gptoss: needs: get-gptoss-configs @@ -116,21 +120,21 @@ jobs: dp-attn: ${{ matrix.config.dp-attn || false }} conc: ${{ matrix.config.conc }} - # collect-70b-results: - # needs: benchmark-70b - # if: ${{ always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: "70b_1k1k" + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_1k1k" - # collect-dsr1-results: - # needs: benchmark-dsr1 - # if: ${{ always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: "dsr1_1k1k" + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" collect-gptoss-results: needs: benchmark-gptoss diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml new file mode 100644 index 000000000..da747e3ed --- /dev/null +++ b/.github/workflows/1k8k-sweep.yml @@ -0,0 +1,145 @@ +name: "1K/8K Sweep" + +concurrency: + group: benchmark-lock-1k8k + cancel-in-progress: false + +on: + # pull_request: + workflow_dispatch: + +jobs: + get-70b-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} 
+ steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-dsr1-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 1k8k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_1k8k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ 
matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-dsr1: + needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k8k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k8k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss: + needs: get-gptoss-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k8k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k8k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_1k8k" + + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + 
secrets: inherit + with: + exp-name: "gptoss_1k8k" diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml new file mode 100644 index 000000000..d5ffc3f43 --- /dev/null +++ b/.github/workflows/8k1k-sweep.yml @@ -0,0 +1,145 @@ +name: "8K/1K Sweep" + +concurrency: + group: benchmark-lock-8k1k + cancel-in-progress: false + +on: + # pull_request: + workflow_dispatch: + +jobs: + get-70b-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-70b-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-dsr1-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-dsr1-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-gptoss-configs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + echo 
"search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + benchmark-70b: + needs: get-70b-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: 70b 8k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "70b_8k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-dsr1: + needs: get-dsr1-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 8k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_8k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss: + needs: get-gptoss-configs + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 8k1k + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_8k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + 
dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + collect-70b-results: + needs: benchmark-70b + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "70b_8k1k" + + collect-dsr1-results: + needs: benchmark-dsr1 + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + + collect-gptoss-results: + needs: benchmark-gptoss + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" From d339b8f44eb2ef77df349f709a906e3d70bd523a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 032/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 6 +- .github/workflows/1k8k-sweep.yml | 5 + .github/workflows/70b-tmpl.yml | 230 ---------------- .github/workflows/8k1k-sweep.yml | 2 + .github/workflows/dsr1-tmpl.yml | 265 ------------------- .github/workflows/gptoss-tmpl.yml | 176 ------------ .github/workflows/test.yml | 147 ++++++++++ utils/matrix-logic/get_test_sweep_configs.py | 151 +++++++++++ 8 files changed, 309 insertions(+), 673 deletions(-) delete mode 100644 .github/workflows/70b-tmpl.yml delete mode 100644 .github/workflows/dsr1-tmpl.yml delete mode 100644 .github/workflows/gptoss-tmpl.yml create mode 100644 .github/workflows/test.yml create mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 768d278f5..58ee3131c 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,12 +1,14 @@ name: "1K/1K Sweep" concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: false + group: benchmark-lock-1k1k + cancel-in-progress: false on: pull_request: workflow_dispatch: +# schedule: +# - cron: '0 23 * * *' jobs: get-70b-configs: diff --git a/.github/workflows/1k8k-sweep.yml 
b/.github/workflows/1k8k-sweep.yml index da747e3ed..5a89e54b2 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,6 +4,11 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false +on: + workflow_dispatch: + schedule: + - cron: '0 23 * * *' + on: # pull_request: workflow_dispatch: diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml deleted file mode 100644 index 3d1dd5051..000000000 --- a/.github/workflows/70b-tmpl.yml +++ /dev/null @@ -1,230 +0,0 @@ -name: Template - LLaMA 70B - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - -jobs: - bmk-h100-fp8: - if: ${{ inputs.use_h100 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h100 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[2, 4, 8]' - - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ 
inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-h200-trt-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-b200-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 - - bmk-mi300x-fp8: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi300x - image: 
'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-mi325x-fp8: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-mi355x-fp8: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-b200-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - 
secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'trt' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-mi355x-fp4: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index d5ffc3f43..9dc28c52b 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -7,6 +7,8 @@ concurrency: on: # pull_request: workflow_dispatch: +# schedule: +# - cron: '0 23 * * *' jobs: get-70b-configs: diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml deleted file mode 100644 index 3a48710f2..000000000 --- a/.github/workflows/dsr1-tmpl.yml +++ /dev/null @@ -1,265 +0,0 @@ -name: Template - DeepSeek R1 - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: 
boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - use_gb200: - type: boolean - required: false - default: false - -jobs: - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'lmsysorg/sglang:v0.5.2rc2-cu126' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-h200-trt-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-b200-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-b200-trt-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ 
inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-mi300x-fp8: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi300x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-mi325x-fp8: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-mi355x-fp8: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' - - bmk-b200-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - model: 'nvidia/DeepSeek-R1-0528-FP4' - framework: 'sglang' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len 
}} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4,8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # Custom concurrency values for this job - - bmk-b200-trt-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/DeepSeek-R1-0528-FP4-v2' - framework: 'trt' - precision: fp4 - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128, 256]' # DPA4EP4 is already 30 tok/s/user and DPA8EP8 is already 35tok/s/user. 512 conc would be too much so we skipping it - - bmk-mi355x-fp4: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - framework: 'sglang' - precision: 'fp4' - model: 'amd/DeepSeek-R1-0528-MXFP4-Preview' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - # These tensor parallelism settings are not necessary as they cannot fall on the Pareto frontier with this particular container - we remove them to save CI time. 
- tp-list: ${{ inputs.isl == 1024 && inputs.osl == 1024 && '[4, 8]' || '[8]' }} - - bmk-gb200-fp4-multinode-mtp-off: - if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - mtp-mode: 'off' - - bmk-gb200-fp4-multinode-mtp-on: - if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - mtp-mode: 'on' - - bmk-gb200-fp8-multinode: - if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'dynamo-sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - mtp-mode: 'off' diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml deleted file mode 100644 index 8bb8d13a6..000000000 --- a/.github/workflows/gptoss-tmpl.yml +++ /dev/null @@ -1,176 +0,0 @@ -name: 
Template - gpt-oss - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - -jobs: - bmk-h100: - if: ${{ inputs.use_h100 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h100 - image: 'vllm/vllm-openai:v0.10.2' - model: 'openai/gpt-oss-120b' - tp-list: '[2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-h200: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-b200: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-b200-trt: - if: ${{ inputs.use_b200 }} - uses: 
./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: b200-nvs - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'trt' - precision: 'fp4' - - bmk-h200-trt: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'trt' - precision: 'fp4' - - bmk-mi300x: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi300x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-mi325x: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' - framework: 'vllm' - precision: 'fp4' - - bmk-mi355x: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: 
inherit - with: - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'openai/gpt-oss-120b' - tp-list: '[1, 4, 8]' - framework: 'vllm' - precision: 'fp4' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: "Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - "mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + 
- name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: 
actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py new file mode 100644 index 000000000..87ab0457b --- /dev/null +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -0,0 +1,151 @@ +import json +import yaml +import sys +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from a specific configuration key' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Load and merge all config files + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. 
" + f"Available keys: {available_keys}" + ) + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + print(json.dumps(matrix_values)) + 
return matrix_values + +if __name__ == "__main__": + main() \ No newline at end of file From 2b284f9203c529b38c4d953312fe5803404cf68d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:43:37 -0500 Subject: [PATCH 033/149] adding more workflows --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 0d92952da..8299f1623 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -1,4 +1,4 @@ -name: Test - Full Sweep +name: Test Sweep concurrency: group: benchmark-lock From 09e9c4974cc8f11f158d88ec9a57f56f4aace9d3 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:45:02 -0500 Subject: [PATCH 034/149] adding more workflows --- .github/workflows/test.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 8299f1623..2c4e672cb 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,7 +5,9 @@ concurrency: cancel-in-progress: false on: - pull_request: + push: + branches: + - initial-refactor workflow_dispatch: inputs: name: From 15553b8dff2e3b851078e0fc7adb5e60f15e4c6d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 15:09:58 -0500 Subject: [PATCH 035/149] adding more workflows --- .github/workflows/test.yml | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 2c4e672cb..01a3bd5fa 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -5,9 +5,6 @@ concurrency: cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: name: @@ -101,7 +98,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: "dsr1_1k1k" + exp-name: "test" isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: 
${{ matrix.config.max-model-len }} From 471b7c2be93bf1a8e503e95f3a38d40e44b1fd63 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 15:21:23 -0500 Subject: [PATCH 036/149] adding more workflows --- .github/workflows/test.yml | 2 +- utils/matrix-logic/get_test_sweep_configs.py | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 01a3bd5fa..ecc590503 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -98,7 +98,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: "test" + exp-name: ${{ matrix.config.model-code }}_test isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py index 87ab0457b..8c021cd93 100644 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -72,6 +72,9 @@ def main(): f"Available keys: {available_keys}" ) + # Extract model code (everything before first hyphen) + model_code = args.key.split('-')[0] + val = all_config_data[args.key] # Validate required fields @@ -120,6 +123,7 @@ def main(): entry = { 'image': image, 'model': model, + 'model-code': model_code, 'precision': precision, 'framework': framework, 'runner': runner, From fca9c160773c4aed0e0f668682fedfa4ae3c36d1 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 18:01:04 -0500 Subject: [PATCH 037/149] adding more workflows --- .github/workflows/test.yml | 1 + utils/matrix-logic/get_test_sweep_configs.py | 49 ++++++++++++++++---- 2 files changed, 40 insertions(+), 10 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ecc590503..ab70e8ccd 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -84,6 +84,7 @@ jobs: run: | 
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --test-mode \ --key ${{ inputs.name }} \ ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py index 8c021cd93..b4b1366e7 100644 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -37,6 +37,11 @@ def main(): default=2, help='Step size for concurrency values (default: 2)' ) + parser.add_argument( + '--test-mode', + action='store_true', + help='Generate only the lowest concurrency value for each TP level' + ) args = parser.parse_args() @@ -117,9 +122,8 @@ def main(): assert None not in (tp, conc_start, conc_end), \ f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: + # In test mode, only use the lowest concurrency (conc_start) + if args.test_mode: entry = { 'image': image, 'model': model, @@ -130,7 +134,7 @@ def main(): 'isl': isl, 'osl': osl, 'tp': tp, - 'conc': conc, + 'conc': conc_start, 'max-model-len': isl + osl, } @@ -141,12 +145,37 @@ def main(): entry['dp-attn'] = dp_attn matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end + else: + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'model-code': model_code, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 
'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end print(json.dumps(matrix_values)) return matrix_values From 8ba4de923417f7441135ed5d81e8a97c06f74947 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 08:56:11 -0500 Subject: [PATCH 038/149] adding more workflows --- .github/workflows/test.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index ab70e8ccd..1aac35921 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -72,6 +72,25 @@ on: - "mi355x-amd_3" jobs: + verify-compatible-runner: + runs-on: ubuntu-latest + if: ${{ inputs.runner != '' }} + steps: + - name: Verify runner compatible + shell: python + run: | + import re + + inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', ${{ inputs.name }}) + if inputs_name_re: + config_gpu = inputs_name_re.group(1) + inputs_runner_re = re.match(r'^([^-]+)', ${{ inputs.runner }}) + if inputs_runner_re: + runner_gpu = inputs_runner_re.group(1) + + assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'"" + + get-jobs: runs-on: ubuntu-latest outputs: From 8df3aa3384b609dbd26e6b58f88a91678b15c781 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 08:58:33 -0500 Subject: [PATCH 039/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 2 +- .github/workflows/test.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 58ee3131c..5a7ac4a25 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ 
-5,7 +5,7 @@ concurrency: cancel-in-progress: false on: - pull_request: + # pull_request: workflow_dispatch: # schedule: # - cron: '0 23 * * *' diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 1aac35921..4756aae65 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -93,6 +93,8 @@ jobs: get-jobs: runs-on: ubuntu-latest + needs: verify-compatible-runner + if: ${{ always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} steps: From 903f2f6bcfbc22bce869a4eab7ce2bd5f209df35 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 08:59:14 -0500 Subject: [PATCH 040/149] adding more workflows --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4756aae65..af764e56e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -88,7 +88,7 @@ jobs: if inputs_runner_re: runner_gpu = inputs_runner_re.group(1) - assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'"" + assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" get-jobs: From 60465c823960ebba040a8d898d96bf81b742397d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:00:55 -0500 Subject: [PATCH 041/149] adding more workflows --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index af764e56e..27e97ef95 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -81,10 +81,10 @@ jobs: run: | import re - inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', ${{ inputs.name }}) + inputs_name_re = 
re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}'') if inputs_name_re: config_gpu = inputs_name_re.group(1) - inputs_runner_re = re.match(r'^([^-]+)', ${{ inputs.runner }}) + inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}'') if inputs_runner_re: runner_gpu = inputs_runner_re.group(1) From ae2505ed974d6122f38592259116b0ee6670b258 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:01:30 -0500 Subject: [PATCH 042/149] adding more workflows --- .github/workflows/test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 27e97ef95..a600e1bc5 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -81,10 +81,10 @@ jobs: run: | import re - inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}'') + inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') if inputs_name_re: config_gpu = inputs_name_re.group(1) - inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}'') + inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') if inputs_runner_re: runner_gpu = inputs_runner_re.group(1) From 6fec99ef2b33a874aaa536de04c7ff6068a57bf8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:41:20 -0500 Subject: [PATCH 043/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 6 ++-- .github/workflows/1k8k-sweep.yml | 6 ++-- .github/workflows/8k1k-sweep.yml | 6 ++-- .github/workflows/test.yml | 60 +++++++++++--------------------- 4 files changed, 30 insertions(+), 48 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 5a7ac4a25..80bcca43e 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -21,7 +21,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -34,7 +34,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -47,7 +47,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 5a89e54b2..604e9b9d3 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -24,7 +24,7 @@ jobs: - id: get-70b-configs run: 
| - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -37,7 +37,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -50,7 +50,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 9dc28c52b..58c676b56 100644 --- 
a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -21,7 +21,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -34,7 +34,7 @@ jobs: - id: get-dsr1-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -47,7 +47,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index a600e1bc5..54bed54fe 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -7,24 +7,10 @@ concurrency: on: workflow_dispatch: inputs: - name: - description: "Name of benchmark from master configs" + generate-cli-command: + description: "Command passed to generate matrix script" required: true type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true runner: description: "Specific runner node to run on" @@ -72,29 +58,29 @@ on: - "mi355x-amd_3" jobs: - verify-compatible-runner: - runs-on: ubuntu-latest - if: ${{ inputs.runner != '' }} - steps: - - name: Verify runner compatible - shell: python - run: | - import re + # verify-compatible-runner: + # runs-on: ubuntu-latest + # if: ${{ inputs.runner != '' }} + # steps: + # - name: Verify runner compatible + # shell: python + # run: | + # import re - inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') - if inputs_name_re: - config_gpu = inputs_name_re.group(1) - inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') - if inputs_runner_re: - runner_gpu = inputs_runner_re.group(1) + # inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') + # if inputs_name_re: + # config_gpu = inputs_name_re.group(1) + # inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') + # if inputs_runner_re: + # runner_gpu = inputs_runner_re.group(1) - assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" + # assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" get-jobs: runs-on: ubuntu-latest - needs: verify-compatible-runner - if: ${{ 
always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} + # needs: verify-compatible-runner + # if: ${{ always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} steps: @@ -103,17 +89,13 @@ jobs: - id: get-jobs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --test-mode \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py test-config --test-mode ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT test-sweep: needs: get-jobs uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ inputs.name }} + name: test ${{ inputs.name }} strategy: fail-fast: false matrix: From 0226fc558dc412fd19f98de2364b8bfd16d81ef7 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 09:45:59 -0500 Subject: [PATCH 044/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 331 +++++++++++++++++++ 1 file changed, 331 insertions(+) create mode 100644 utils/matrix-logic/generate_sweep_configs.py diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py new file mode 100644 index 000000000..43998fc5b --- /dev/null +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -0,0 +1,331 @@ +import json +import yaml +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 
1024) +} + +def generate_full_sweep(args, all_config_data): + """Generate full sweep configurations based on model prefix and sequence lengths.""" + isl, osl = seq_len_stoi[args.seq_lens] + + matrix_values = [] + for key, val in all_config_data.items(): + # Filter by model prefix + if not key.startswith(args.model_prefix): + continue + + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields for key '{key}'" + + # Check if this config has matching sequence lengths + matching_seq_config = None + for slq in seq_len_configs: + if slq.get('isl') == isl and slq.get('osl') == osl: + matching_seq_config = slq + break + + if not matching_seq_config: + continue # Skip this config if no matching sequence length + + bmk_space = matching_seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + 
conc = conc_end + + return matrix_values + +def generate_test_config(args, all_config_data): + """Generate test configurations for a specific key.""" + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. " + f"Available keys: {available_keys}" + ) + + # Extract model code (everything before first hyphen) + model_code = args.key.split('-')[0] + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + 
# In test mode, only use the lowest concurrency (conc_start) + if args.test_mode: + entry = { + 'image': image, + 'model': model, + 'model-code': model_code, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc_start, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + else: + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'model-code': model_code, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + return matrix_values + +def load_config_files(config_files): + """Load and merge configuration files.""" + all_config_data = {} + for config_file in config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + return all_config_data + +def main(): + # Create parent parser with common arguments + parent_parser = argparse.ArgumentParser(add_help=False) + 
parent_parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + + # Create main parser + parser = argparse.ArgumentParser( + description='Generate benchmark configurations from YAML config files' + ) + + # Create subparsers for subcommands + subparsers = parser.add_subparsers( + dest='command', + required=True, + help='Available commands' + ) + + # Subcommand: full-sweep + full_sweep_parser = subparsers.add_parser( + 'full-sweep', + parents=[parent_parser], + add_help=False, + help='Generate full sweep configurations based on model prefix' + ) + full_sweep_parser.add_argument( + '--seq-lens', + choices=list(seq_len_stoi.keys()), + required=True, + help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" + ) + full_sweep_parser.add_argument( + '--model-prefix', + required=True, + help='Model prefix to filter configurations' + ) + full_sweep_parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + full_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + # Subcommand: test-config + test_config_parser = subparsers.add_parser( + 'test-config', + parents=[parent_parser], + add_help=False, + help='Generate test configurations for a specific key' + ) + test_config_parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + test_config_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + test_config_parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + test_config_parser.add_argument( + '--test-mode', + action='store_true', + help='Generate only the lowest concurrency value for each TP level' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + args = parser.parse_args() + + # Load configuration files + all_config_data = load_config_files(args.config_files) + + # Route to appropriate function based on subcommand + if args.command == 'full-sweep': + matrix_values = generate_full_sweep(args, all_config_data) + elif args.command == 'test-config': + matrix_values = generate_test_config(args, all_config_data) + else: + parser.error(f"Unknown command: {args.command}") + + print(json.dumps(matrix_values)) + return matrix_values + +if __name__ == "__main__": + main() From 3f7609d66fd1eb16e2c4f5f7ce9f59b3a7c80f16 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 10:11:19 -0500 Subject: [PATCH 045/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 43998fc5b..0d835bf0d 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -94,9 +94,16 @@ def generate_test_config(args, all_config_data): f"Available keys: {available_keys}" ) - # Extract model code (everything before first hyphen) + # Extract model code from config key model_code = args.key.split('-')[0] - + # Extract GPU from config key + config_gpu = args.key.split('-')[2] + runner_gpu = args.runner_node.split('-')[0] if args.runner_node else None + + # If user enters a runner not compatible with input GPU sku, error + if runner_gpu and config_gpu != runner_gpu: + raise 
ValueError(f"GPU '{config_gpu}' used in selected config '{args.key}' cannot run on selected runner node '{args.runner_node}'.") + val = all_config_data[args.key] # Validate required fields @@ -107,7 +114,8 @@ def generate_test_config(args, all_config_data): model = val.get('model') precision = val.get('precision') framework = val.get('framework') - runner = val.get('runner') + # Use default runner or specific runner node if input by user + runner = val.get('runner') if not args.runner_node else args.runner_node assert None not in (image, model, precision, framework, runner), \ f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" @@ -287,6 +295,11 @@ def main(): required=True, help='Configuration key to use' ) + test_config_parser.add_argument( + '--runner-node', + required=False, + help='Specific runner node to use' + ) test_config_parser.add_argument( '--seq-lens', nargs='+', From 395bbb067465fe460ce98c1da63ae77f2cbe2776 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 10:20:25 -0500 Subject: [PATCH 046/149] adding more workflows --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 54bed54fe..4cd314ef0 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -95,7 +95,7 @@ jobs: test-sweep: needs: get-jobs uses: ./.github/workflows/benchmark-tmpl.yml - name: test ${{ inputs.name }} + name: ${{ inputs.generate-cli-command }} strategy: fail-fast: false matrix: From 8510c0aeadd8935a2b8f4abf5afdc249af472f7a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 10:28:51 -0500 Subject: [PATCH 047/149] adding more workflows --- .github/workflows/test.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 4cd314ef0..a21b118f1 100644 --- a/.github/workflows/test.yml +++ 
b/.github/workflows/test.yml @@ -1,8 +1,8 @@ name: Test Sweep -concurrency: - group: benchmark-lock - cancel-in-progress: false +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false on: workflow_dispatch: @@ -102,7 +102,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: ${{ matrix.config.model-code }}_test + exp-name: ${{ matrix.config.model-code }}_test_${{ matrix.config.isl }}_${{ matrix.config.osl }} isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} From f439163edb9ddf293f23b1bd430d47e7d6cc2d59 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 15:29:03 -0500 Subject: [PATCH 048/149] adding more workflows --- .github/configs/runners.yaml | 48 ++++ .github/workflows/test.yml | 70 +---- utils/matrix-logic/generate_sweep_configs.py | 266 ++++++++++++++----- 3 files changed, 250 insertions(+), 134 deletions(-) create mode 100644 .github/configs/runners.yaml diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml new file mode 100644 index 000000000..692cf74ad --- /dev/null +++ b/.github/configs/runners.yaml @@ -0,0 +1,48 @@ +h100: +- 'h100-cr_0' +- 'h100-cr_1' +- 'h100-cw_0' +- 'h100-cw_1' +h200: +- 'h200-cw_0' +- 'h200-cw_1' +- 'h200-nb_0' +- 'h200-nb_1' +- 'h200-nb_2' +- 'h200-nb_3' +- 'h200-nv_0' +- 'h200-nv_1' +- 'h200-nv_2' +- 'h200-nv_3' +b200-trt: +- 'b200-nv_0' +- 'b200-nv_1' +b200: +- 'b200-nb_0' +- 'b200-nb_1' +- 'b200-nvd_0' +- 'b200-nvd_1' +- 'b200-nvd_2' +- 'b200-nvd_3' +- 'b200-tg_0' +mi300x: +- 'mi300x-amd_0' +- 'mi300x-amd_1' +- 'mi300x-amd_2' +- 'mi300x-amd_3' +- 'mi300x-amd_4' +- 'mi300x-cr_0' +- 'mi300x-oci_0' +mi325x: +- 'mi325x-amd_0' +- 'mi325x-tw_0' +- 'mi325x-tw_1' +- 'mi325x-tw_2' +- 'mi325x-tw_3' +mi355x: +- 'mi355x-amd_0' +- 'mi355x-amd_1' +- 'mi355x-amd_2' +- 'mi355x-amd_3' +gb200: +- gb200-nv_0 diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml 
index a21b118f1..e56fc9a82 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -12,75 +12,9 @@ on: required: true type: string - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - jobs: - # verify-compatible-runner: - # runs-on: ubuntu-latest - # if: ${{ inputs.runner != '' }} - # steps: - # - name: Verify runner compatible - # shell: python - # run: | - # import re - - # inputs_name_re = re.match(r'^[^-]+-[^-]+-([^-]+)', '${{ inputs.name }}') - # if inputs_name_re: - # config_gpu = inputs_name_re.group(1) - # inputs_runner_re = re.match(r'^([^-]+)', '${{ inputs.runner }}') - # if inputs_runner_re: - # runner_gpu = inputs_runner_re.group(1) - - # assert config_gpu == runner_gpu, f"Specified runner '${{ inputs.runner }})' is not compatible with config '${{ inputs.name }}'" - - get-jobs: runs-on: ubuntu-latest - # needs: verify-compatible-runner - # if: ${{ always() && (needs.verify-compatible-runner.result == 'success' || needs.verify-compatible-runner.result == 'skipped') }} outputs: search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} steps: @@ -89,7 +23,7 @@ jobs: - id: get-jobs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py test-config --test-mode ${{ 
inputs.generate-cli-command }}) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT test-sweep: @@ -106,7 +40,7 @@ jobs: isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 0d835bf0d..408a7e353 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -8,8 +8,82 @@ "8k1k": (8192, 1024) } +def validate_master_configs_structure(all_config_data): + """Validate the structure of all master config entries. + + This validates that all required fields are present, have correct types, + and no extra fields exist. Should be called once after loading config files. 
+ """ + for key, val in all_config_data.items(): + # Check for required top-level fields and their types + required_fields = { + 'image': str, + 'model': str, + 'precision': str, + 'framework': str, + 'runner': str, + 'seq-len-configs': list + } + + for field, expected_type in required_fields.items(): + if field not in val or val[field] is None: + raise ValueError(f"Missing required field '{field}' for key '{key}'") + if not isinstance(val[field], expected_type): + raise ValueError(f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") + + seq_len_configs = val['seq-len-configs'] + if len(seq_len_configs) == 0: + raise ValueError(f"'seq-len-configs' must be a non-empty list for key '{key}'") + + # Validate each seq-len-config + for i, seq_config in enumerate(seq_len_configs): + # Check isl + if 'isl' not in seq_config or seq_config['isl'] is None: + raise ValueError(f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config['isl'], int): + raise ValueError(f"'isl' must be int in seq-len-config[{i}] for key '{key}'") + + # Check osl + if 'osl' not in seq_config or seq_config['osl'] is None: + raise ValueError(f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config['osl'], int): + raise ValueError(f"'osl' must be int in seq-len-config[{i}] for key '{key}'") + + bmk_space = seq_config.get('bmk-space') + if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: + raise ValueError(f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") + + # Validate each benchmark in bmk-space + for j, bmk in enumerate(bmk_space): + # Define allowed fields + allowed_fields = {'tp', 'conc-start', 'conc-end', 'ep', 'dp-attn'} + required_bmk_fields = {'tp': int, 'conc-start': int, 'conc-end': int} + optional_bmk_fields = {'ep': int, 'dp-attn': bool} + + # Check for extra fields + extra_fields = set(bmk.keys()) - allowed_fields + if extra_fields: + 
raise ValueError(f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + + # Validate required fields + for field, expected_type in required_bmk_fields.items(): + if field not in bmk or bmk[field] is None: + raise ValueError(f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + if not isinstance(bmk[field], expected_type): + raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + + # Validate optional fields if they exist + for field, expected_type in optional_bmk_fields.items(): + if field in bmk and bmk[field] is not None: + if not isinstance(bmk[field], expected_type): + raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + + def generate_full_sweep(args, all_config_data): - """Generate full sweep configurations based on model prefix and sequence lengths.""" + """Generate full sweep configurations based on model prefix and sequence lengths. + + Assumes all_config_data has been validated by validate_config_structure(). 
+ """ isl, osl = seq_len_stoi[args.seq_lens] matrix_values = [] @@ -18,41 +92,32 @@ def generate_full_sweep(args, all_config_data): if not key.startswith(args.model_prefix): continue - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields for key '{key}'" + seq_len_configs = val['seq-len-configs'] + image = val['image'] + model = val['model'] + precision = val['precision'] + framework = val['framework'] + runner = val['runner'] # Check if this config has matching sequence lengths matching_seq_config = None for slq in seq_len_configs: - if slq.get('isl') == isl and slq.get('osl') == osl: + if slq['isl'] == isl and slq['osl'] == osl: matching_seq_config = slq break if not matching_seq_config: continue # Skip this config if no matching sequence length - bmk_space = matching_seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" + bmk_space = matching_seq_config['bmk-space'] for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') + tp = bmk['tp'] + conc_start = bmk['conc-start'] + conc_end = bmk['conc-end'] ep = bmk.get('ep') dp_attn = bmk.get('dp-attn') - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" - # Generate entries for each concurrency value in the range conc = conc_start while conc <= conc_end: @@ -84,41 +149,24 @@ def generate_full_sweep(args, all_config_data): return matrix_values + def generate_test_config(args, all_config_data): - """Generate test configurations for a specific key.""" - # Check if the key exists - if args.key not in all_config_data: - 
available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. " - f"Available keys: {available_keys}" - ) + """Generate test configurations for a specific key. + Assumes all_config_data has been validated by validate_config_structure(). + """ # Extract model code from config key model_code = args.key.split('-')[0] - # Extract GPU from config key - config_gpu = args.key.split('-')[2] - runner_gpu = args.runner_node.split('-')[0] if args.runner_node else None - - # If user enters a runner not compatible with input GPU sku, error - if runner_gpu and config_gpu != runner_gpu: - raise ValueError(f"GPU '{config_gpu}' used in selected config '{args.key}' cannot run on selected runner node '{args.runner_node}'.") - - val = all_config_data[args.key] - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + val = all_config_data[args.key] - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') + seq_len_configs = val['seq-len-configs'] + image = val['image'] + model = val['model'] + precision = val['precision'] + framework = val['framework'] # Use default runner or specific runner node if input by user - runner = val.get('runner') if not args.runner_node else args.runner_node - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + runner = val['runner'] if not args.runner_node else args.runner_node # Convert seq-lens to set of (isl, osl) tuples for filtering seq_lens_filter = None @@ -129,29 +177,22 @@ def generate_test_config(args, all_config_data): # Process each sequence length configuration for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - 
f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + isl = seq_config['isl'] + osl = seq_config['osl'] # Filter by sequence lengths if specified if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + bmk_space = seq_config['bmk-space'] for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') + tp = bmk['tp'] + conc_start = bmk['conc-start'] + conc_end = bmk['conc-end'] ep = bmk.get('ep') dp_attn = bmk.get('dp-attn') - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - # In test mode, only use the lowest concurrency (conc_start) if args.test_mode: entry = { @@ -209,6 +250,68 @@ def generate_test_config(args, all_config_data): return matrix_values + +def generate_runner_model_sweep_config(args, all_config_data): + """Generate runner-model sweep configurations. + + Assumes all_config_data has been validated by validate_config_structure(). + """ + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + + runner_nodes = runner_config.get(args.runner_type) + + if not runner_nodes: + raise ValueError(f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + + matrix_values = [] + for key, val in all_config_data.items(): + # Only consider configs with specified runner + if val['runner'] != args.runner_type: + continue + + # Find 1k1k config + target_config = None + for config in val['seq-len-configs']: + if config['isl'] == 1024 and config['osl'] == 1024: + target_config = config + break + + highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + # Since we are just testing, pick the highest TP for this config and just test + # on that TP with the lowest concurrency available + highest_tp = highest_tp_bmk['tp'] + lowest_conc = highest_tp_bmk['conc-start'] + + ep = highest_tp_bmk.get('ep') + dp_attn = highest_tp_bmk.get('dp-attn') + + for node in runner_nodes: + entry = { + 'image': val['image'], + 'model': val['model'], + 'precision': val['precision'], + 'framework': val['framework'], + # Add one entry for each node under specified runner type + 'runner': node, + # Again, just use 1k1k since this is just meant to smoke test all runners + 'isl': 1024, + 'osl': 1024, + 'tp': highest_tp, + 'conc': lowest_conc + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + return matrix_values + + def load_config_files(config_files): """Load and merge configuration files.""" all_config_data = {} @@ -216,10 +319,13 @@ def load_config_files(config_files): try: with open(config_file, 'r') as f: config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + assert isinstance( + config_data, dict), f"Config file '{config_file}' must contain a dictionary" - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + # Check for duplicate keys, this is only in place to prevent against the very unlikely + # case 
where an entry in one config accidentally/purposefully tries to override an entry in another config + duplicate_keys = set(all_config_data.keys()) & set( + config_data.keys()) if duplicate_keys: raise ValueError( f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" @@ -231,6 +337,7 @@ def load_config_files(config_files): return all_config_data + def main(): # Create parent parser with common arguments parent_parser = argparse.ArgumentParser(add_help=False) @@ -324,21 +431,48 @@ def main(): help='Show this help message and exit' ) + # Subcommand: runner-model-sweep + test_config_parser = subparsers.add_parser( + 'runner-model-sweep', + parents=[parent_parser], + add_help=False, + help='Sweep across all runner nodes and all compatible models for a given runner' + ) + test_config_parser.add_argument( + '--runner-type', + required=True, + help='Runner type (e.g., h200-trt, h100)' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + args = parser.parse_args() - # Load configuration files + # Load and validate configuration files all_config_data = load_config_files(args.config_files) + validate_master_configs_structure(all_config_data) # Route to appropriate function based on subcommand if args.command == 'full-sweep': matrix_values = generate_full_sweep(args, all_config_data) elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) + elif args.command == 'runner-model-sweep': + matrix_values = generate_runner_model_sweep_config(args, all_config_data) else: parser.error(f"Unknown command: {args.command}") print(json.dumps(matrix_values)) return matrix_values + if __name__ == "__main__": main() From 28665f238ac5668563ee79c6e36b182ac3b7b822 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 
28 Oct 2025 15:48:31 -0500 Subject: [PATCH 049/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 57 +++++++++++++------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 408a7e353..a0d5676cb 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -8,6 +8,7 @@ "8k1k": (8192, 1024) } + def validate_master_configs_structure(all_config_data): """Validate the structure of all master config entries. @@ -27,56 +28,70 @@ def validate_master_configs_structure(all_config_data): for field, expected_type in required_fields.items(): if field not in val or val[field] is None: - raise ValueError(f"Missing required field '{field}' for key '{key}'") + raise ValueError( + f"Missing required field '{field}' for key '{key}'") if not isinstance(val[field], expected_type): - raise ValueError(f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") + raise ValueError( + f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") seq_len_configs = val['seq-len-configs'] if len(seq_len_configs) == 0: - raise ValueError(f"'seq-len-configs' must be a non-empty list for key '{key}'") + raise ValueError( + f"'seq-len-configs' must be a non-empty list for key '{key}'") # Validate each seq-len-config for i, seq_config in enumerate(seq_len_configs): # Check isl if 'isl' not in seq_config or seq_config['isl'] is None: - raise ValueError(f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") if not isinstance(seq_config['isl'], int): - raise ValueError(f"'isl' must be int in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'isl' must be int in seq-len-config[{i}] for key '{key}'") # Check osl if 'osl' not in seq_config or 
seq_config['osl'] is None: - raise ValueError(f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") if not isinstance(seq_config['osl'], int): - raise ValueError(f"'osl' must be int in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'osl' must be int in seq-len-config[{i}] for key '{key}'") bmk_space = seq_config.get('bmk-space') if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: - raise ValueError(f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") # Validate each benchmark in bmk-space for j, bmk in enumerate(bmk_space): # Define allowed fields - allowed_fields = {'tp', 'conc-start', 'conc-end', 'ep', 'dp-attn'} - required_bmk_fields = {'tp': int, 'conc-start': int, 'conc-end': int} + allowed_fields = {'tp', 'conc-start', + 'conc-end', 'ep', 'dp-attn'} + required_bmk_fields = {'tp': int, + 'conc-start': int, 'conc-end': int} optional_bmk_fields = {'ep': int, 'dp-attn': bool} # Check for extra fields extra_fields = set(bmk.keys()) - allowed_fields if extra_fields: - raise ValueError(f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate required fields for field, expected_type in required_bmk_fields.items(): if field not in bmk or bmk[field] is None: - raise ValueError(f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") if not isinstance(bmk[field], expected_type): - raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'{field}' must be {expected_type.__name__} 
in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate optional fields if they exist for field, expected_type in optional_bmk_fields.items(): if field in bmk and bmk[field] is not None: if not isinstance(bmk[field], expected_type): - raise ValueError(f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + raise ValueError( + f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") def generate_full_sweep(args, all_config_data): @@ -98,6 +113,9 @@ def generate_full_sweep(args, all_config_data): precision = val['precision'] framework = val['framework'] runner = val['runner'] + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name + # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME + model_code = key.split('-')[0] # Check if this config has matching sequence lengths matching_seq_config = None @@ -130,7 +148,8 @@ def generate_full_sweep(args, all_config_data): 'isl': isl, 'osl': osl, 'tp': tp, - 'conc': conc + 'conc': conc, + 'model_code': model_code, } # Add optional fields if they exist @@ -260,9 +279,10 @@ def generate_runner_model_sweep_config(args, all_config_data): runner_config = yaml.safe_load(f) runner_nodes = runner_config.get(args.runner_type) - + if not runner_nodes: - raise ValueError(f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") matrix_values = [] for key, val in all_config_data.items(): @@ -466,7 +486,8 @@ def main(): elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) elif args.command == 'runner-model-sweep': - matrix_values = generate_runner_model_sweep_config(args, all_config_data) + matrix_values = generate_runner_model_sweep_config( + args, all_config_data) else: parser.error(f"Unknown command: {args.command}") From 99aec702d2305cdda8ca7e9ed8b88c5b3bd5ffc1 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 15:51:43 -0500 Subject: [PATCH 050/149] adding more workflows --- .github/workflows/1k8k-sweep.yml | 13 +++++-------- utils/matrix-logic/generate_sweep_configs.py | 7 ++++++- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 604e9b9d3..cced99997 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,14 +4,11 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - on: # pull_request: workflow_dispatch: +# schedule: +# - cron: '0 23 * * *' jobs: get-70b-configs: @@ -24,7 +21,7 @@ jobs: - id: get-70b-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-dsr1-configs: @@ -37,7 +34,7 @@ jobs: - id: get-dsr1-configs run: | - 
CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -50,7 +47,7 @@ jobs: - id: get-gptoss-configs run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-70b: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index a0d5676cb..8f11d79df 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -289,6 +289,10 @@ def generate_runner_model_sweep_config(args, all_config_data): # Only consider configs with specified runner if val['runner'] != args.runner_type: continue + + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name + # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME + model_code = key.split('-')[0] # Find 1k1k config target_config = None @@ -318,7 +322,8 @@ def generate_runner_model_sweep_config(args, all_config_data): 'isl': 1024, 'osl': 1024, 'tp': highest_tp, - 'conc': lowest_conc + 'conc': lowest_conc, + 'model-code': model_code, } # Add optional fields if they exist From 3ea4aa2731bbd23b89848c28c46607f6b4ad7b8f Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Tue, 28 Oct 2025 15:56:53 -0500 Subject: [PATCH 051/149] adding more workflows --- utils/matrix-logic/generate_sweep_configs.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 8f11d79df..6dfdc2bf9 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -149,7 +149,8 @@ def generate_full_sweep(args, all_config_data): 'osl': osl, 'tp': tp, 'conc': conc, - 'model_code': model_code, + 'model-code': model_code, + 'max-model-len': isl + osl, } # Add optional fields if they exist @@ -324,6 +325,7 @@ def generate_runner_model_sweep_config(args, all_config_data): 'tp': highest_tp, 'conc': lowest_conc, 'model-code': model_code, + 'max-model-len': 2048, } # Add optional fields if they exist From 60906564838628da9b560560e41d5b825404b022 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 08:10:39 -0500 Subject: [PATCH 052/149] adding more workflows --- .github/configs/runners.yaml | 14 ++ utils/matrix-logic/generate_sweep_configs.py | 128 ++++++++++++++++++- 2 files changed, 139 insertions(+), 3 deletions(-) diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 692cf74ad..692ade8dd 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -14,9 +14,23 @@ h200: - 'h200-nv_1' - 'h200-nv_2' - 'h200-nv_3' +h200-trt: +- 'h200-cw_0' +- 'h200-cw_1' +- 'h200-nb_0' +- 'h200-nb_1' +- 'h200-nb_2' +- 'h200-nb_3' +- 'h200-nv_0' 
+- 'h200-nv_1' +- 'h200-nv_2' +- 'h200-nv_3' b200-trt: - 'b200-nv_0' - 'b200-nv_1' +b200-nvs: +- 'b200-nv_0' +- 'b200-nv_1' b200: - 'b200-nb_0' - 'b200-nb_1' diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 6dfdc2bf9..10dc07c82 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -276,8 +276,11 @@ def generate_runner_model_sweep_config(args, all_config_data): Assumes all_config_data has been validated by validate_config_structure(). """ - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") runner_nodes = runner_config.get(args.runner_type) @@ -290,10 +293,85 @@ def generate_runner_model_sweep_config(args, all_config_data): # Only consider configs with specified runner if val['runner'] != args.runner_type: continue - + + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name + # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME + model_code = key.split('-')[0] + + # Find 1k1k config + target_config = None + for config in val['seq-len-configs']: + if config['isl'] == 1024 and config['osl'] == 1024: + target_config = config + break + + highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + # Since we are just testing, pick the highest TP for this config and just test + # on that TP with the lowest concurrency available + highest_tp = highest_tp_bmk['tp'] + lowest_conc = highest_tp_bmk['conc-start'] + + ep = highest_tp_bmk.get('ep') + dp_attn = highest_tp_bmk.get('dp-attn') + + for node in runner_nodes: + entry = { + 'image': val['image'], + 'model': val['model'], + 'precision': val['precision'], + 'framework': val['framework'], + # Add one entry for each node under specified runner type + 'runner': node, + # Again, just use 1k1k since this is just meant to smoke test all runners + 'isl': 1024, + 'osl': 1024, + 'tp': highest_tp, + 'conc': lowest_conc, + 'model-code': model_code, + 'max-model-len': 2048, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + return matrix_values + + +def generate_runner_sweep_config(args, all_config_data): + """Generate runner sweep configurations. + + Assumes all_config_data has been validated by validate_config_structure(). 
+ """ + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") + + + matrix_values = [] + for key, val in all_config_data.items(): + # Only consider configs with specified runner + if not key.startswith(args.model_prefix): + continue + + # Optionally filter by precision and framework + if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): + continue + # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME model_code = key.split('-')[0] + + runner_nodes = runner_config.get(val['runner']) + if not runner_nodes: + raise ValueError( + f"Runner '{val['runner']}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") # Find 1k1k config target_config = None @@ -336,6 +414,14 @@ def generate_runner_model_sweep_config(args, all_config_data): matrix_values.append(entry) + if len(matrix_values) == 0: + error_msg = f"No configs found matching model prefix '{args.model_prefix}'" + if args.precision: + error_msg += f", precision '{args.precision}'" + if args.framework: + error_msg += f", framework '{args.framework}'" + raise ValueError(error_msg + ".") + return matrix_values @@ -481,6 +567,39 @@ def main(): help='Show this help message and exit' ) + # Subcommand: runner-sweep + test_config_parser = subparsers.add_parser( + 'runner-sweep', + parents=[parent_parser], + add_help=False, + help='For a given model, run configurations on all compatible runners' + ) + test_config_parser.add_argument( + '--model-prefix', + required=True, + help='Model prefix (e.g., 70b)' + ) + test_config_parser.add_argument( + '--precision', + required=False, + help='Precision to filter by (e.g., fp4) (optional)' 
+ ) + test_config_parser.add_argument( + '--framework', + required=False, + help='Framework to filter by (e.g., trt) (optional)' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + args = parser.parse_args() # Load and validate configuration files @@ -495,6 +614,9 @@ def main(): elif args.command == 'runner-model-sweep': matrix_values = generate_runner_model_sweep_config( args, all_config_data) + elif args.command == 'runner-sweep': + matrix_values = generate_runner_sweep_config( + args, all_config_data) else: parser.error(f"Unknown command: {args.command}") From 9b570de0eaba5333fbdd64944ce11556b3969ab9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 10:41:24 -0500 Subject: [PATCH 053/149] adding script --- .github/workflows/1k1k-sweep.yml | 12 ++++++------ utils/matrix-logic/generate_sweep_configs.py | 2 ++ 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 80bcca43e..dd8ae9f9c 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -70,8 +70,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -94,8 +94,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -118,8 +118,8 @@ jobs: framework: 
${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 10dc07c82..1c3472eb8 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -151,6 +151,8 @@ def generate_full_sweep(args, all_config_data): 'conc': conc, 'model-code': model_code, 'max-model-len': isl + osl, + 'ep': 1, # Default + 'dp-attn': False, # Default } # Add optional fields if they exist From 2bb9dfaa17f4aaac2c63e60f05d84c363fd134e8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 10:42:46 -0500 Subject: [PATCH 054/149] removing extraneous files --- utils/matrix-logic/get_full_sweep_configs.py | 137 --- utils/matrix-logic/get_test_sweep_configs.py | 184 ---- .../test_get_full_sweep_configs.py | 842 ------------------ 3 files changed, 1163 deletions(-) delete mode 100644 utils/matrix-logic/get_full_sweep_configs.py delete mode 100644 utils/matrix-logic/get_test_sweep_configs.py delete mode 100644 utils/matrix-logic/test_get_full_sweep_configs.py diff --git a/utils/matrix-logic/get_full_sweep_configs.py b/utils/matrix-logic/get_full_sweep_configs.py deleted file mode 100644 index 01e13f313..000000000 --- a/utils/matrix-logic/get_full_sweep_configs.py +++ /dev/null @@ -1,137 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from configuration files' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML 
format)' - ) - parser.add_argument( - '--seq-lens', - choices=list(seq_len_stoi.keys()), - required=True, - help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" - ) - parser.add_argument( - '--model-prefix', - required=True, - help='Model prefix to filter configurations' - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - isl, osl = seq_len_stoi[args.seq_lens] - - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys, shouldn't really be an issue but with NVIDIA and AMD - # separate configs this will help against any possible confusion - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - matrix_values = [] - for key, val in all_config_data.items(): - # Filter by model prefix i.e., - if not key.startswith(args.model_prefix): - continue - - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields for key '{key}'" - - # Check if this config has matching sequence lengths - matching_seq_config = None - for slq in seq_len_configs: - if slq.get('isl') == isl and slq.get('osl') == osl: - matching_seq_config = slq - break - - if not 
matching_seq_config: - continue # Skip this config if no matching sequence length, this is possible - - bmk_space = matching_seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in matching seq-len-config for key '{key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - return matrix_values - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py deleted file mode 100644 index b4b1366e7..000000000 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ /dev/null @@ -1,184 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from a specific configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - 
nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - parser.add_argument( - '--test-mode', - action='store_true', - help='Generate only the lowest concurrency value for each TP level' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. 
" - f"Available keys: {available_keys}" - ) - - # Extract model code (everything before first hyphen) - model_code = args.key.split('-')[0] - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # In test mode, only use the lowest concurrency (conc_start) - if args.test_mode: - entry = { - 'image': image, - 'model': model, - 'model-code': model_code, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc_start, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - else: - # Generate entries for 
each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'model-code': model_code, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - return matrix_values - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/utils/matrix-logic/test_get_full_sweep_configs.py b/utils/matrix-logic/test_get_full_sweep_configs.py deleted file mode 100644 index beee33aeb..000000000 --- a/utils/matrix-logic/test_get_full_sweep_configs.py +++ /dev/null @@ -1,842 +0,0 @@ -import pytest -import json -import yaml -import tempfile -import os -from pathlib import Path -from get_full_sweep_configs import main, seq_len_stoi - - -@pytest.fixture -def temp_config_dir(tmp_path): - """Create a temporary directory for config files.""" - return tmp_path - - -@pytest.fixture -def valid_nvidia_config(): - """Return a valid NVIDIA config structure.""" - return { - "70b-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/Llama-3.3-70B-Instruct-FP4", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 128, "conc-end": 128}, - {"tp": 2, "conc-start": 64, "conc-end": 128}, - ] - }, - { - "isl": 1024, - "osl": 8192, - "bmk-space": [ - {"tp": 4, "conc-start": 16, "conc-end": 128}, - ] - } - ] - } - } - - -@pytest.fixture -def valid_amd_config(): - """Return a valid AMD config structure.""" - return { - "70b-fp8-mi355x-vllm": { - "image": 
"rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1", - "model": "amd/Llama-3.3-70B-Instruct-FP8-KV", - "runner": "mi355x", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - ] - } - ] - } - } - - -@pytest.fixture -def config_with_optional_fields(): - """Return a config with optional ep and dp-attn fields.""" - return { - "dsr1-fp4-b200-trt": { - "image": "nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2", - "model": "nvidia/DeepSeek-R1-0528-FP4-V2", - "runner": "b200-trt", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 32}, - {"tp": 4, "ep": 4, "conc-start": 64, "conc-end": 128}, - {"tp": 4, "ep": 4, "dp-attn": True, "conc-start": 256, "conc-end": 256}, - ] - } - ] - } - } - - -def create_config_file(temp_dir, filename, config_data): - """Helper to create a YAML config file.""" - config_path = temp_dir / filename - with open(config_path, 'w') as f: - yaml.dump(config_data, f) - return str(config_path) - - -class TestMainFunction: - """Test suite for the main function.""" - - def test_basic_config_1k1k(self, temp_config_dir, valid_nvidia_config, monkeypatch, capsys): - """Test basic configuration with 1k1k sequence lengths.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Verify output structure - assert isinstance(result, list) - assert len(result) == 3 # 1 config with 128 + 2 configs (64, 128) - - # Verify all results have required fields - for entry in result: - assert 'image' in entry - assert 'model' in entry - assert 'precision' in entry - assert 'framework' in entry - assert 'runner' in entry - assert 'isl' in 
entry - assert 'osl' in entry - assert 'tp' in entry - assert 'conc' in entry - assert entry['isl'] == 1024 - assert entry['osl'] == 1024 - - # Verify JSON output to stdout - captured = capsys.readouterr() - json_output = json.loads(captured.out.strip()) - assert json_output == result - - def test_multiple_config_files(self, temp_config_dir, valid_nvidia_config, valid_amd_config, monkeypatch): - """Test with multiple config files.""" - nvidia_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - amd_file = create_config_file(temp_config_dir, "amd.yaml", valid_amd_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', nvidia_file, amd_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Should have entries from both configs - assert len(result) > 0 - runners = {entry['runner'] for entry in result} - assert 'b200-trt' in runners - assert 'mi355x' in runners - - def test_model_prefix_filtering(self, temp_config_dir, valid_nvidia_config, config_with_optional_fields, monkeypatch): - """Test that model prefix filtering works correctly.""" - combined_config = {**valid_nvidia_config, **config_with_optional_fields} - config_file = create_config_file(temp_config_dir, "combined.yaml", combined_config) - - # Filter for 70b only - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Should only have 70b configs - assert all('70b' in list(combined_config.keys())[0] for entry in result) - assert len(result) == 3 # Only from 70b config - - # Filter for dsr1 only - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'dsr1' - ]) - - result = main() - - # Should only have dsr1 configs - # 3 bmk-space entries: [4,8,16,32] + [64,128] + [256] = 4+2+1 = 7 entries - assert len(result) == 7 - - def 
test_optional_fields_ep_and_dp_attn(self, temp_config_dir, config_with_optional_fields, monkeypatch): - """Test that optional ep and dp-attn fields are included when present.""" - config_file = create_config_file(temp_config_dir, "config.yaml", config_with_optional_fields) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'dsr1' - ]) - - result = main() - - # Check entries without optional fields - entries_without_ep = [e for e in result if 'ep' not in e] - assert len(entries_without_ep) > 0 - for entry in entries_without_ep: - assert entry['conc'] <= 32 - - # Check entries with ep but without dp-attn - entries_with_ep_no_dp = [e for e in result if 'ep' in e and 'dp-attn' not in e] - assert len(entries_with_ep_no_dp) > 0 - for entry in entries_with_ep_no_dp: - assert entry['ep'] == 4 - assert 64 <= entry['conc'] <= 128 - - # Check entries with both ep and dp-attn - entries_with_both = [e for e in result if 'ep' in e and 'dp-attn' in e] - assert len(entries_with_both) > 0 - for entry in entries_with_both: - assert entry['ep'] == 4 - assert entry['dp-attn'] is True - assert entry['conc'] == 256 - - def test_step_size_default(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test default step size of 2.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # For tp=2, conc-start=64, conc-end=128 with step=2 - # Should generate: 64, 128 - tp2_entries = [e for e in result if e['tp'] == 2] - tp2_concs = sorted([e['conc'] for e in tp2_entries]) - assert tp2_concs == [64, 128] - - def test_step_size_custom(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test custom step size.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - 
monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b', - '--step-size', '4' - ]) - - result = main() - - # For tp=2, conc-start=64, conc-end=128 with step=4 - # Should generate: 64, 128 (64*4=256 > 128, so stop at 128) - tp2_entries = [e for e in result if e['tp'] == 2] - tp2_concs = sorted([e['conc'] for e in tp2_entries]) - assert tp2_concs == [64, 128] - - def test_conc_range_single_value(self, temp_config_dir, monkeypatch): - """Test when conc-start equals conc-end.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - assert len(result) == 1 - assert result[0]['conc'] == 64 - - def test_different_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test with different sequence length configurations.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - # Test 1k8k - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k8k', - '--model-prefix', '70b' - ]) - - result = main() - - # Should match 1k8k config - assert all(e['isl'] == 1024 and e['osl'] == 8192 for e in result) - assert len(result) > 0 - - def test_no_matching_seq_lens(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test when no configs match the requested sequence lengths.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - 
'--config-files', config_file, - '--seq-lens', '8k1k', # Not in the config - '--model-prefix', '70b' - ]) - - result = main() - - # Should return empty list - assert result == [] - - def test_no_matching_model_prefix(self, temp_config_dir, valid_nvidia_config, monkeypatch): - """Test when no configs match the model prefix.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'nonexistent' - ]) - - result = main() - - # Should return empty list - assert result == [] - - -class TestErrorHandling: - """Test suite for error handling.""" - - def test_missing_config_file(self, temp_config_dir, monkeypatch): - """Test error when config file doesn't exist.""" - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', '/nonexistent/file.yaml', - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(ValueError, match="does not exist"): - main() - - def test_invalid_yaml(self, temp_config_dir, monkeypatch): - """Test error when YAML is invalid.""" - config_path = temp_config_dir / "invalid.yaml" - with open(config_path, 'w') as f: - f.write("invalid: yaml: content: [") - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', str(config_path), - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(yaml.YAMLError): - main() - - def test_non_dict_config(self, temp_config_dir, monkeypatch): - """Test error when config is not a dictionary.""" - config_path = temp_config_dir / "list.yaml" - with open(config_path, 'w') as f: - yaml.dump(["not", "a", "dict"], f) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', str(config_path), - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="must contain a dictionary"): - main() - - def test_duplicate_keys(self, temp_config_dir, 
monkeypatch): - """Test error when duplicate keys exist across config files.""" - config1 = { - "70b-fp4-b200-trt": { - "image": "image1", - "model": "model1", - "runner": "runner1", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [] - } - } - config2 = { - "70b-fp4-b200-trt": { # Same key - "image": "image2", - "model": "model2", - "runner": "runner2", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [] - } - } - - file1 = create_config_file(temp_config_dir, "config1.yaml", config1) - file2 = create_config_file(temp_config_dir, "config2.yaml", config2) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', file1, file2, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(ValueError, match="Duplicate configuration keys"): - main() - - def test_missing_seq_len_configs(self, temp_config_dir, monkeypatch): - """Test error when seq-len-configs is missing.""" - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - # Missing seq-len-configs - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing 'seq-len-configs'"): - main() - - def test_missing_required_fields(self, temp_config_dir, monkeypatch): - """Test error when required fields are missing.""" - # Missing 'model' field - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - # Missing model - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64, "conc-end": 64} - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 
'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing required fields"): - main() - - def test_missing_bmk_space(self, temp_config_dir, monkeypatch): - """Test error when bmk-space is missing.""" - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - # Missing bmk-space - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing 'bmk-space'"): - main() - - def test_missing_bmk_space_fields(self, temp_config_dir, monkeypatch): - """Test error when tp, conc-start, or conc-end is missing.""" - config = { - "70b-fp4-b200-trt": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp4", - "framework": "trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 64} # Missing conc-end - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - with pytest.raises(AssertionError, match="Missing 'tp', 'conc-start', or 'conc-end'"): - main() - - -class TestEdgeCases: - """Test suite for edge cases.""" - - def test_empty_config(self, temp_config_dir, monkeypatch): - """Test with empty config file.""" - config = {} - config_file = create_config_file(temp_config_dir, "empty.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - 
result = main() - - # Should return empty list - assert result == [] - - def test_large_conc_range(self, temp_config_dir, monkeypatch): - """Test with large concurrency range.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 1024}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - # With step=2: 4, 8, 16, 32, 64, 128, 256, 512, 1024 - concs = sorted([e['conc'] for e in result]) - assert concs == [4, 8, 16, 32, 64, 128, 256, 512, 1024] - - def test_conc_end_not_power_of_step(self, temp_config_dir, monkeypatch): - """Test when conc-end is not a power of step size.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 10, "conc-end": 100}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - # With step=2: 10, 20, 40, 80, 100 (last value is conc-end) - concs = sorted([e['conc'] for e in result]) - assert concs == [10, 20, 40, 80, 100] - assert concs[-1] == 100 # Should always include conc-end - - def test_all_seq_lens_in_stoi(self): - """Test that all defined seq_lens work correctly.""" - assert seq_len_stoi["1k1k"] == (1024, 1024) - assert seq_len_stoi["1k8k"] == (1024, 8192) - assert seq_len_stoi["8k1k"] == (8192, 1024) - - def 
test_multiple_bmk_space_entries(self, temp_config_dir, monkeypatch): - """Test with multiple bmk-space entries.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 32, "conc-end": 64}, - {"tp": 2, "conc-start": 16, "conc-end": 32}, - {"tp": 4, "conc-start": 8, "conc-end": 16}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test' - ]) - - result = main() - - # Verify all tp values are present - tp_values = sorted(set(e['tp'] for e in result)) - assert tp_values == [1, 2, 4] - - # Verify correct conc ranges for each tp - tp1_concs = sorted([e['conc'] for e in result if e['tp'] == 1]) - tp2_concs = sorted([e['conc'] for e in result if e['tp'] == 2]) - tp4_concs = sorted([e['conc'] for e in result if e['tp'] == 4]) - - assert tp1_concs == [32, 64] - assert tp2_concs == [16, 32] - assert tp4_concs == [8, 16] - - def test_output_format(self, temp_config_dir, valid_nvidia_config, monkeypatch, capsys): - """Test that output is valid JSON and matches expected format.""" - config_file = create_config_file(temp_config_dir, "nvidia.yaml", valid_nvidia_config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', '70b' - ]) - - result = main() - - # Capture stdout - captured = capsys.readouterr() - - # Verify it's valid JSON - json_output = json.loads(captured.out.strip()) - - # Verify it matches the result - assert json_output == result - - # Verify each entry has the correct structure - for entry in json_output: - assert isinstance(entry, dict) - assert all(isinstance(k, str) for k in entry.keys()) - assert entry['image'] == 
valid_nvidia_config['70b-fp4-b200-trt']['image'] - assert entry['model'] == valid_nvidia_config['70b-fp4-b200-trt']['model'] - assert entry['precision'] == valid_nvidia_config['70b-fp4-b200-trt']['precision'] - assert entry['framework'] == valid_nvidia_config['70b-fp4-b200-trt']['framework'] - assert entry['runner'] == valid_nvidia_config['70b-fp4-b200-trt']['runner'] - - -class TestConcurrencyGeneration: - """Test suite specifically for concurrency value generation logic.""" - - def test_conc_progression_step_2(self, temp_config_dir, monkeypatch): - """Test concurrency progression with step size 2.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 1, "conc-end": 16}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test', - '--step-size', '2' - ]) - - result = main() - - # Should multiply by 2 each time: 1, 2, 4, 8, 16 - concs = sorted([e['conc'] for e in result]) - assert concs == [1, 2, 4, 8, 16] - - def test_conc_progression_step_3(self, temp_config_dir, monkeypatch): - """Test concurrency progression with step size 3.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 2, "conc-end": 100}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test', - '--step-size', '3' - ]) - - result = main() - - # Should 
multiply by 3 each time: 2, 6, 18, 54, 100 - concs = sorted([e['conc'] for e in result]) - assert concs == [2, 6, 18, 54, 100] - - def test_conc_exact_end_value(self, temp_config_dir, monkeypatch): - """Test that conc-end is always included even if not reached by progression.""" - config = { - "test-config": { - "image": "test-image", - "model": "test-model", - "runner": "test-runner", - "precision": "fp8", - "framework": "vllm", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "bmk-space": [ - {"tp": 1, "conc-start": 5, "conc-end": 50}, - ] - } - ] - } - } - config_file = create_config_file(temp_config_dir, "config.yaml", config) - - monkeypatch.setattr('sys.argv', [ - 'script.py', - '--config-files', config_file, - '--seq-lens', '1k1k', - '--model-prefix', 'test', - '--step-size', '2' - ]) - - result = main() - - concs = sorted([e['conc'] for e in result]) - # 5, 10, 20, 40, 50 (40*2=80 > 50, so we include 50) - assert concs[-1] == 50 - assert 50 in concs From 34ba318804a5aada119a4609c7cd6e132e8843fa Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 10:43:26 -0500 Subject: [PATCH 055/149] removing extraneous files --- .../get_full_sweep_configs.cpython-313.pyc | Bin 5046 -> 0 bytes ...l_sweep_configs.cpython-313-pytest-8.4.2.pyc | Bin 55816 -> 0 bytes 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc delete mode 100644 utils/matrix-logic/__pycache__/test_get_full_sweep_configs.cpython-313-pytest-8.4.2.pyc diff --git a/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc b/utils/matrix-logic/__pycache__/get_full_sweep_configs.cpython-313.pyc deleted file mode 100644 index b29a85d117b2da207801b8a1143e67830ad9f45b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5046 zcmb7ITWs6b89ovz>wcH4i)DwFmB_LqyQ$qaPMdaVF3!zyD~4%Or4ST}c2rB0LP|}n 
z4PEvyK-~t^K|93ROZ@1f1DeM^^ua)}p%{jhno@{igTO%d5@5ibC0McbVfzmyN=_OT z9YgT^|2g0J@Bh(p9$Kv?1kYzb{qM!2E`+|pjQWsm;$Z@a-y#~(niGg`jT&B~@LFE0 zj08{6TAH9qS~o%PB;6N8yzYD-t>1SRp|k;^^SUs4t}T#4i<(7E%y1v`r()Xs2vqR| zFfpw%+((o`V+&gIgiP*P+Oveed*Qnm=nO1&9t%)dz(ZM&ZB2F-650;gA;RDZt)0q&xtttVktsqUzH{hX#XN9$-KZAw_X2$lN(w@nC?giRXk^5ES){A%ndLm z$XsZ9=6XmO>`1ZpKn!&dwjPKb9fZ9Hg6bd~JrKhkgp)>R?=g19!8oUNFG4-Et1Fn7z^~s6jY6yaX&RqBdTOZi_cgks?aAr(2)+B=%69`jGJ>&XkLqJfN^!AjWhIb zSam&B^G;P3=V3gZJ*^Dw`o1@e2R!)`c;aQePn8$#$m{Qc80{c@JrJ=D!ruci)e=liEWwM{S@DP+yFDc`1kE=XW$)(nw7D&^2}%%0^|(P@d+arVgeW7 zXk-FRh#tdj$^;cY2>iIh8xxT4Cm$xHT16p66aGEL`S};IGoCDyFP58 z8GqMKW%p^2@aIQn^O_=1)~ zJD|2GG^?Ha4x#dMoATf^eF8>u)ggy^ed-f0J5Pm7N|^-TgWX|0Ff(l)Iu)L3Nejr&ay&=&z10G)fSJp zP;q5h){LL7#^*VfD)LmJ$g@wAL5-d}Fn5w#DDt2z79(UGm*y7**}$`xDmk9b%H%~h zU#gnoae=)Q&$FCR-9f_|7|Bw=E|xDoE_WYQ)$XQ1l0I{>n9HyNk8d$Nz8GK&3&kv( zkC%9MA$O&^a}IM{prP9~UUV^{Feu42G21Qe4PY+57l+wHl{^r!bG*zGC0m@ez_VwR4L_i8JwY+s>a*9hu7sLXKOcqEDY+ECQe(1!om4~l%@G{)u7NUmzA|_sjAPG;^}gktNx^; zpj?XXrlQ>)z=3V83D~Bn?HJKj6f^SpHpYALryqat^^31Vz(*#r@Ys-S zfGRJgd4ZK}2l&NGf#u4tVV0LI>1;LytIC{A<{?3{1zb!)v8RM`F(;Ea$ujwNF~`Y7 zjw{P1MH1cwjnjq-vISH@%cfe*NES;hCzH$RLS8ls=>;~GFQ&7y3DU(ubfBAL5*L_E z2rNiEgDdwi&lh>QPbl(bXvDrsDGLQBJC8u~pDLD*;3{iDbh$EH5M+}QX5n(Kd{MS2 zC6Q{imP{;y(7tpDY{-IoXqXgk7+F{1u><7W0@REyTPc);*r42}_`}O2_NW^{*`Chl zQ*E!nl(cN#=Ax`w=?b)2PIO(_#FDHFiRxcZgowxkI?lPSB^noHp= zEjR=$e@Vvmno{ajCV*@uTd6(B8PuMv)LshQ=R>#-p!ia__;`H9mUWPRep$dY^+S(* zY?6T*5hmemg!a#6CXaCoEIw=4xdUe>E9G2Xm~6Qj&leYA9pau^s6g0+%Pa)F?cqde zStbi`s_?IYN_@BpQ}9>eRdm3pHns*JqA5 zW==LH=fsIqqIJH9Aitt!dP}dUAwi94n;P$3t3z@)AGgDXEUue8KCNuU%fPHhg2XqmKjo1J@U?Ew1Jo{iC(n zO}B4NyLSARLEJIZaL<5q);BehDI z91KhF4v*n?C?X9E;WrqPLevAdDd1YsZ3U61|Jut^II?nhHFE8QnQ<4Z6<1|K*>mjxdG4+9^@AABldBnNkhRO%!2 z5O~DTE3+@b4@)J_zRaHi7GGtBm*E5EXf$6Vv+|n0Ms_&1tO3d5hxdV=bZDI}|iKIm8&$50jS=Ntj(XyUltCbxrM1mA-3ZNfI z*`nLVNt&2W(ne0>6Q*%nrA@b@yQj6Y?WvP=&)V5`ojdp5xpU3qaanME{^g&aXc@Cuen=7R zva5k>2doy$ix$};Tko?dR`$2(UR$y#V7b?>I0F2RQ*p98#Y!>zD=8BDTcVV(zb?gv 
zzeV?z-s@J}tc>kG&%I?z*}Y!H8?Zd;JQC0ztpAnm^_ER)RH;}bkHe6sa*;evLmuBE zd5R5rs${9&qEy#fS}k%(y+w9yD&nu9_*q&h(%d4ghNXFsRwmMFS(+DV@{e(P%uJ98bgs(cirpmnn6 zZY3HSj|L)v@#x91gc4Cs2ZmxwbZ|U@&~QQt3?|~kv5~-VY&4qenL(MXG=6d{Jcg$x zsm?%lpf{9^D#_j_Ct{yE|P7sU3SP$xfuT>xdi`QJvO;icJokgk?cXdST2*ja=Dsjr8L~B z=y9&MqM0fuyN*Q1CzLpv=~QGiHWV0m_~71yyS27Sjw=&`xF7AgUcP1h@$TWVjorul z*01j#SH`dR#ZL_?J+VYv{8Vfx7Qx5IlLYR8GS0DJaOF%EbImr zcB{Rr6r&CfN-0GBnDw!y4D(M_yIWa}JM_ThC;+-zvXnH;S09|O-a79O%-1#Hhwg6- z%r`W@)f{-m_PQ@!d(S*VMX%e^wR^8PY_0yci;F8NuT<)})O)rfy640_UwNSHZBNZA z*A-ix+j+%O=Wr_YVma94Mgqg8bqFkiO)j|sh}sOnJiVdp6(~jsYQJ z@xk$M-}=5y>+k5>5>^H`OtwT$4)rdIIa|B$dpPUl48P^g|j zHGu{KEdW5hmQuoc*H5+@d==Gv>?cG=OzOhNv zH{JmHrh@vW!mMuyMOfd^LrnXI#yX933IAxcdwT3FOye9Qy~ojO!}zAYt5kcis!@u9%bSa)Q6JPwK&9UV&cOjZmfl?^##UMa&{O*W~6 zcrVDC9F0EFePl!T`b~XXY4~DhRL+Xr@p%0oL}c7pf2q+7& zN)6qv1$gMP1%o~4V7HX@NV#6DzMP(Enf`0wUmB(_4QpQ-=AWkYqe^PvWCVbBWg{Dm zn@k6z^))43y9a~uob|QJH!&~|vG6NHZ?>h5eI#8QzEW(b5h!5<+FaDj6;0@6AN8_N z>t*Uy?V_2A_3|_VFItXTyD>=Pwqy8eeuZqZRht%zEOGlWb$Yc}ju7d}yIL(r+qK+a z9g66m9zRCt&90Dbllp{qON(Wlg=i98Lr5Ji3Rs#g0sI@N+hJ?A3|oV?$@T-$(Xpr! 
z7*7NSwdwSib|1Q58}c-F_GGP34Fv63dtxja&)QE%PL5{nLlY;*vNlllpaZii&3c&b zvJ&NwgO+AX)!rG3j7LO-X5OrIIO)JE1?b}Bw30tsn>VQrWf!u9@jrPdz_XUOD(lmg z9nao(sl+q&#B~4plIF7;E=lgsm3+G7%-Zu(%R4rUEAXSzvZ-e>r41>2!#fVdFO)Q2 zXA|!WHFw&+>DpaVJc!D~rM8si+CxZs(efx`?UW+fs@UWr#V*^x06B;O17imk3v3;j zC@^(klfcpud*mY4NGX+FfNmKqk>ZivfMv2r@s5-Qz1i}Iz%U#Fd$Vr>)DDC3@(3O_ z*+R*I1}w75)=<$Vx`!+72-%LP zVV=Sv=Osc8m79=yiw(FVo};muai;;FoW~@sCap~3DGjUkTZh(Kz#hu3WAujXIzn?# z-qk8tSnA9nP>N$jX4rLv$VlGRYB@&Zhg}*zYtM{;uoy}c>B+myN= zqj`W`hSD?@sBKTUA#(!NG}%YtxsL20yq`_j(sx7+#sXy;Iw+(KfSxL58-d^A+d*1r0R zB~ieNrJSu;I*-0{BP*6=*EagdJUD=F;ZT8(^tkm?OnCED`;gbN@2ye_Jb*Vk=Lms zty=DudqUOfNPF*6w<^f3dnY$C0rl}nGB&8H0w$CQ$*;L$X~Pp6o(MowIzg)P07U*H zP^tG^FXeip9#Ropmy)NGJrQN()b*0&pb{G!hl2ciS+}ZK?Pgl}>#pu@RpFY<+NoB` z!SuAp3d5!X}rEEl7yaqJkYUv?DWQ9()ElQ2<%`<>x0SFl(He1T*ngng2|xk zSF*RW> zvR``|0t=sphXi);ci0?EW^HJR8N2czYEg~=P_6w0HeW9@e8E=AI(B-iLgQ}6nk|aO z@wiI$3j$CBs87y1qVaL%^h^<}O*w|_ITdZ<6>U=5=!rC9l}-Zf1R4ntW#IZZI}|d5 zOjT!mmiqE&Q|%$749R&2K1_KOiCQr3_M7)KrK+vWT^P`^SI z2YpPjlPyk0$DxT-6q3R!hX_0XkagcRb~=0<#u!izC$rAc#9(AJnROwR8D2z`Y;8Ds zA~7*K6jq>543CT^jz>nrOyg7%PDaRhV@%1G?Pll#biGGW5=zz=hLQz#9O3BGW22Ec zLuBQb=z$esJYkeoHkdd$7Ez*E7xZ95_@1buWbO2ES;uH1GK4NeV%7nbS8Ock&)SpG z(P3ua!8HHb(vyk!6VcO9aSfixItL?TK!2h5Vk)fxdSEGiA;*P&9^tRH$#|#F7|FS_ z(O71SF>;3jnPyJ?>y5W_3U1CyctJAz7*d|dE49H`sy|5rd#0*e0J!eX&pL(?3Qd1U z!#LYT1jMNL>M_swIOgR`ch^pLQYd<*#1VA;#NsF^{Yjan(*IljDd$g|mh$@P&abv-yj>}&>!P=7PRjYE zrLNCTinQDqPrfL1O?Mh|7-AyFSKEy_wAclyD`ob?OWw*en?LhRN@|{tFp%~(&q?Yp zBQ;MwBVs&oL2CYNQlzDGVm$ex)O=>MF^3jIQLQ@EJRK2vL;%TXCV%79J@dZ$l)G&@ zn)bELxz*pSyY0*t9y_lG#ciZbM@1~16BEf9ciSzkx+&%EI@^)kdq%$+=T zUJr`fNITmhV(FZiNY1#s=6$!Y>V}lNeR?A8YoBwgzgc(tnXNo_UJr`fNSmGzv2;#M zBxl_1^S;`YyJh;G7k19MTljC*-Ew9V51!Y9;x^Kz?-8+dPD~_c+%31V=9-i{Fum!8 z9dqsg|INAsXEyTSc|9m@BW)U=q-IX%#6)t&9hmpkrQEI4p%-?|xm)>f*4>KX!D8q2 zpty~+>5zz}b7CSn<8DQ3Ojp0Kose%M1Nj)ZUB^H*2%I{LZ_*K!hZ(BftjaFByWXzB zc)rqTsj8pzwFAR4zRoG>lE40okAMF0v$cQJ_?5=Fmi}~0f2L(q#=m*0blzJ(J)HJ} 
zh^fDf)IRmh^stEYzy(SbY3ZC8PrfL%8%iOEAtnlerWh;71QcCL2F@4LBXwlW`$FilDUdr_C^ip`MGQ%^> za|^?PWpfHuwCq(C%%w1ztq7Hxh$!@TG&ff;6H!z`!7O26J`*bTSswNzi#Ax2MLx>` zU|8ZGWFH}4tmqJ|M`_6(FN=HRDj__Z(prcqpY^Ak<5+b_0*?7}l+St@^LEme&owu_ zpM@G(`-+14S(HLQ*T}U(`3yz0sq$I>Li0W`Q$C|l#(pL@+-_$YfVaAxM=EJQM6xLE z8MXu)Cj-R)b78?rEW2Z4qfw(xVX{Zri|V1CQQ1) z0G7+vomh(Y7PdIw$(GWzgr()4LQ62^mkB&Z;3EJ7e6`F>TFKy0i2!7axsiBM8Kk5k z0#O2M0A}255v#;0!A^kH{Un7>5r9C}qKpwx2qX!N6PN%9Rw}r&u+9x*ahvtX96&LsBgydD%R5z?j~5i#PM^q5En z-!$*7NlC5KThiXvIZ6G^O082*^VoSkC~hNddW(pqb7CSHncltXu=utnVWwyK>vd)K zb=b|VO@D+8AMw6uS#Jpy-D3^e?y;Wq=CEYehrEI%b1cG=v3)Fs+n$EJnA~7h`?y4w zS{iF)&Z;H!iSzft{E>yHK24ZTo?3l^)mnx%TgWV}Mw`*aZ29!krYz9H(jda6h6&iT zl+FSzn^Ukqw2OmTZ?v2)(R?HtUpOuFKRU~w%8kTqgTt1{;`2AuDI!aB^U$kd$hMKsIL~3MqgFH zI&>u%MQpL&7G-)HXd$$^xy`U%${Z&|DZEvs;Th(+g*D}}IZL&+5_XqhOksEFQQ6|M z72uO#>9jgIKE++YslP3VI{25hOgG2On%kJEVVL$e&*uI%X0nDc!^{y+uy%3-V^_yd zM8*S5`xM~XC)zs-ML;Y*5}@6INwBaA?JylZ-81Qd!r*u;9xa%E&DqmRb8kQXUnBNy zr?m~MLfYAi6=OTpsn}zzPqYsenaMh|sZ9GM*-C3nn%GWRUUUR3hdTLMbEIgCjkfye zEc8IIh-+Ky&{rgb*lURmob(!5ul7kQLzpBPOCazp7xkBHABo1HK~Z*QcP+vq(gw>g zZ#rUKdDHc5=X#u-J)(lIq26Q=;AfY?Ii@NPf5-p2Xa0*fsU zu+IrQbySV|4kTvUO6ocg5~+FKVbl4{gsbWXn??H2s}&RV+1}< zz>M$z9o$F~{mITy_>WDMVtZss)k?KCz6Y&;~OP;yo}{DITMEZtPYW(qFJQm!tEJ`l>J^HMXH?b6WHE@HchYFsPAih2pzu6@a^ zYTCy4<+iFhLJmmVoXMhIE^Wgmptw^m2|4e?l5_|tdkC`Tkn?cSA-oNfycOqcZxb@g z+fv^GWYfe%=nU1PFyCOT4W&pZk#~V97Nx-6{=!Oq4@?33JRuL4_F{WcRpQ@5 z-fa*W85XLTFAS|=x=024%_Y45TVF02%9yvHV7`~lX^;gAscWrVC#0^W24g+Fx6zCF z7y3;sA2comEI&!YvE6FybB8{y8mQ%t(P}Y9QK2?QA@$YAES9ir%!1ixW40n^%nC{G zhjh#~7#4M=OH7S3Bwlc+kTEM-bVXxUl){*8SQhJqsAAciLM(C1H8mC33TQpBSsk?{ z#6M%yHnLHBV8x?$6&tlpM}NUbt=ydWF{faOKfyLzP%n(LK)f>K3RN;%_A`McV2eXu zEj9#+w4ra!DO=0Y&uuJ{TaW&aP~}FOwguNv>b=r7UGO1k)arBH@KLn_Dp;k}rTx{ZAn63-FkOzj;+6ZYhwfM*Mbs!{K1Hm~p@`Mmb3ddMvGb-= zMA4!eb{C}66s15BJD0`k4yv8@gz2xPPVm+-f#61b@+I z1_Nii=EFTlf|9?;DY?*kJBP6X4UqmjClp*edABZF)Vvg#`(p~g`WVLFgQF8e(V@Uo zC!%qZLMCAX4$FG(PNxu6`5$1?U@n!IzFav&QH*FVV)nbiP7- 
zm~xXVryxw>x9y1C8*4|^;dmlEG^W67i7IUyp%6Le zP)H{QLF6A(m_V>dw}4ll0`qpBhCo0~+XBme8#jlKq7>QDZ<#b2*WbZy;_t{-(c$U~tsW*DItH!&P{to!+9=yhX7#qyvSxi|)$W)5XLrA{@wF#k-SK;o8GlNuPp#Sw zkzC5xk@0q>q&4R%7)X29%t`7mBdtk!VU4Y)AbdeuLoz@$Eu9nN$rn+~*I~>eq9`nC zMD6D)L@p6P^7pE{$UM4s+W*3il)Lln9tP6B&N;XGn{{`df$b{A&g;Ppo7B!;wbGt<@5kT^# z>NUJgs$RGgX5(jD7|6FBhZQzil9TPY8iY0YS(qj3CgwcMjKr%`?(XSRUs^?fR;7L2 z_~sPSnu@27LOA@si>D!kP7KRT&>*|ELG#4KUl3X|aaV;Z?#dHU=Lop5 zvGiHd@=BD# z@~V8<<&`c*s!+w8rSPNWyJ-f$RSgG}(`bJ&M_q@v%Nq-wx7_z6Yu7F#YhSR~a>1n3^rDHsoTv$`)k&XH=W~K7fINxv^bkn6}%38UG!>-#Pu@xuKc;uRJ+( z{}-S9O6OE*%74eDs`{B{G7WdW{LHzQ>EV|S&ODQDxHD6=eX3-BRr6Wj^ud(WF?}#i zQ-=DRl{%(Qix>}F$Vp4u1y_l$N{ zH%HJeDA*0tJ)=FExnNVLdp<$;$fE0JZFKN6y5~2gdo`1OM)w9M;KTi-PWB3>c-Kp@ z9~`5;n?`-sx{><_UW@`NuK}|;ZTSX;bQ-68lj8pgAh`&Isb-5OopA71&UVuKFI+C= z=ldDe{NDgpMqlRLb<_P11Z&Io2BkqFD%vH8 z%vGeCy0umv*!RK&R&_iM7xg(Vkzsyn*yCjnnc6*ZclRMX8E48qxoR1cL5E?FmqRc~ zPFYHvQXQ!XBjN)0cxl?%5nVeguWVNhIdPS=hPxpLxO!<*B4BEGj9oT2osknQx*~!m zN`aB9HME53x9A45%jOhT1ah6a$E!rHmm5MQs!qqX0-cV~BxyJUiEtzII8Kni=U~ph zil!pDiD0}`E=l}t$jwZnhqaa%0uN<)V{?62HQb+A<>sS*spFENjCp$qo%^ym#XdDT zpiTuIxkYXj#=RvgXb&;110K1ZIay$lc?kg0Co(C zr+&V!gMRg0^Q&6tSGB<&EzmyS-1b&m`zw21-;-|KkCP2L>v8Htwf~AuYOln}290Gn z*`U$|UkmM)njkrAc+;QpA6~gw@JAmsuL$$r>NAn&Pfz>P-atwsp&x-X$sg6qxB@(*I&}xtDn_FVFE_ z{$brkBU)6OZkd8!Z$a~(7Nx*=7*}`Zxy6*XECvZ?LF}T*_g)SI4t^QxGT)VD=Di&F z#J!heuY!NFpU6JnT00aQX6IEw#FXodE7xh4y4%)lawM-?Tc6N3+2(9gAO@clVAsen z-LORgMrDZ#*}_;LE3rL0tfC(z%w(33RW4$8ZFE=Nkg!TsncCTg1P6Ce#in%Ju_)xG zZh)QC1HsixltOZ(BxDwt8rgKH{4T=TJI!Z3(G+1c1)U?7G}(8sgXTbbRle=u;I}$o zd+&4dv73C8_aNI8W25F<+`MF&B|wB~Q#XQ<}dg zsZbHV%dmDaBS;^or{&4&X$h+@QuqR;ju9SMd9d%s;zUs2>>6{@W0vK727 zmAVZ8v8r!O!Dn3M^S^?9Nc=Y|HB6lnF&?;(lLjwyS`OTpuFT7fDC)R9ywO8+eJF-E z+akxUuuO-yahqr%@@+WxNMQFQoR5-H>|rz`7hju=Yh|8pe-|KWHt1F@7K$gHi0%WVhIF%M| zLkG#IN{JnZchDp%f;Jw>j%aU- zxOU0bUL0Wfpg8<6U|H_yLw(LC|892_6lU{1a1_y=`(PX@9JirES4_iP5iR@LwXEuMkA3>F zm$s$4cVs*}FS*MHtM> zexb?9nBr_pIvH~j&k^FJOLV-Q{^=bL#G}P$-o;NpG^m3_4LDsDI&HZ$N0lUL#m!y) 
z8P@S-T2B41#xOEeB@I2h45*s-#v}H{QFY)1N7co1jM!CVcB^i;49Lt5^MsXaEY6O* z?U{6IA#&2K&?`;WsY%-cl(vm{S(>D_1&$4vi45}aI#{A$3za47cuK)>mfR~AA&~N~ z1pX(1Ep)$xUPVXd8Al#6D?2FbPJsOLK`R(yt1lU5=a056AXgltjhV&>m7hm_KxKc& zxw165zr5tBIJ0&-dEV1@_Q`p-=X3j?-=C`LJ?~ymDBNzc)_rO4?AlCI&myje)v>Vg z_jZ11=LOFiF&NgSJZs*mMvWIdZP(wavXrgG-Rnv0XtaF3a*xyYMqlZk8r!#=u02)7 z->Px#=~!f#+*|N4S&oW=>e?Q(h77~RWZ2MzmUpScM$lVmXz@;TnGPEo;OsI#2vk2x zf(|xTY598{AdeM1K*}bWb2Afak0C;oIf5hP**UE^O}=NcgnY$wf=&Ykt}<-Ml(k0S z7XsM|86A^#Ct(c1h7_IeYOG(mOm*_n!-j~m6$z#TXq;}?30wonA0Jh0d|)EN8N4`B zn=eJVDT4{o&&&5LjgrrzRz6Bf)BgV4Q39O)+`&&Dob|6cxBI-i=RJ*+ov97?XFP|* zFd49>Jcn||$$<5O=a4#3>i5`eZ}gV#sj_{`=Gs$H{H-e2p4QxfLURq9Ps-Uqks){F zPGDsx5Wbks$ByHqWffH&hE%)mUKslo8Tk#on5Awy5V1-G;2CJ7KQ$X>Xeg+oy? zO2~0LOTEUfBg7u%T{x>&%roS7NgWXMRqQg%7=9W5^f_+Czr=jtSLcJ8$vcQn8qs}d zVr(=v7{R&2noJzt-i zuTRU@H|gfsxPgr#E!J1aK9>7X?_}d%@xHtw?HCGrdp3Xuf=+`FM4xkhjN+VYD$bi| zvc>8)JZs*73HdxCMIE5}#|T_=?}+StiR*{FaKgv%6jyczu7S{QfD&rSeuo%)qm$ zPV=Kn&s^Z*?#p=Y7bEhJb}qpfuJ7TX(JTzee3<4*8#v^>f;_vvvg zFzk^G<7+X)9>#;gum=+Zji(%Ltng8UF}TDis>JxLG>oFkB}P%DI*NP?Ck^$?64i_W z+!X|WobTr2MtKv>qWlek?+_qar9#sjSp8NCw-IP3&_SS+zzl(V2#EDXp8Wz}N7NnS zj|z0sD3LZ3@-MHvQ<7?4U-rjLI%@?)Q(eOVH^}7fG3&sB$4gd3Ajbp#!0-7nMgln`2WOSTU@myrq z3ak6(mPO{g{G`xH{x(Nui4pCNw^UP_R;(hHHKievzhYCG#Zdj;jORW$z@ZmcO+ z+#oFnFSP_MuL8$ZWuWC%OVIKvm6lhlW@-imKYsFw?&Mfx5PNS6rQ!^|v3a$T(3UB$ zTPRd00{b)s?h?;dN%!K&%g@=-C@ac^**kLu?-Ddv~C^}b#jC?n12bJecNFt zdoL&1dwB>+y^J0Q-a9?sm+|ZuNO}NBy8ohQze(YSFQ^{8q1z0&2WwzCb#1jgPJ1okI@Hr^(d3>yEU7$;UJ;d;SJOMAbES7W4#sVzYXml?%Mj^Xz+Mbsq~{6&tqZ$d;|mFv5TI^t}7K7n5trv#>ja9a5Lt98UIIP>eggl@?h>G}!}SvXLLmYX)A zU9BwxfWiTw72tr?54gS=V2`!Y;rr?KCWnvLMKuiKzqn8vyRL1=pBvRhD5WE=U}@IF zx^`$JdUph(@7Ox@++;n&{p1A1iUDkm4@7Z0Fi7`v^$ubxOW}>kU3uQF0=O<0OZdeW zAKXoO!F~sCTUzLL`M2ECf^VjLDmH#15Is3IewqlLCaCC{bg{HTB?^UTUV}#K2-34C zVu55hi{UeLV~W7@0H%ssF1Tjzr%C0SbMTxCt&JIaiDZ2R6+#?W;hS=D|ElZWbY}by zENy3<%N+7E@xY1RVsi*pG=>+pScu=G__e$65qYa8ZMH7Pd~E|`dF$qD;`-U83D4p= zdw@)~Q&&@wMYdBHLyYKu(3*HIB(eP+Us|o!UeI>x8g}e*?a`+Vc2#vttBy<@cT4Ic 
zE#`ZLbnltkyA|ZEO6-<_T~(bfhFrv7nlN*g>s7ldm)syXhFt32n)m88auw{Vrbkxk z+X{nCldX*YvvEI)gJePb!S_tS0e5`xbb#%L=uy6nw#k>atfjPd1oRCf%mqy~rLBm+ z`CBS63;;PjRdJd6LJ1J)CQwhHiogbdm<5Q>A1VwyvJQIdWWMPZ+o6c9*trVbd_wao z)!d9(&2;lLjW@QwRRLe-@|}&W!HDRnTS2m1W2Q~8obcplqH&un!sC|TBC`ASjp@b% z3pu8nwnYTzR94i?Y|hm5y}bEsWV-XEl9|ovn!ZeV|CA$JyDKH_O4aU4dw0!A>Tg!s zH8mk(Ja8c=Ev?6J1I6=lBZ^Ai#SX^Wes(y@8{V?L4U6n^ycd*i5gG?ovbw{OD4oY- z30+nIN8)AYliB$$N4Yv@3B7U}=U;}#)!ZWjD==L}v7dDyHX_MjHDYp|C2lM5XEdwo_}_S?i>z-4oa4G40J+6rZI ztK1eUQz^Z71-k}?uVGDRj9rvyg}56n@f#ye25R3v$rg9q;Eu=^?G~g3E!WD9C7y~Z z`VaFLa2G1EQHvkfYya`w(&4) z*l=PP_HEHIzN7G-+(}~b?1MC2JW$YdvFG)>|6fiQ-SDK9s_izq(wddJX}ZvTYatA0 zScWus($Zq&C95_kD=&#G^WLg6N1i`TXFK1`fboFmS)9lWcOT98!CSr-q{Er-7H2Yx z0CJFD4}I?07m#wc;}ip57=OKu%{|8uLoTHJ_m_8n$^C zJdTUq09u`(DCA?;3X0NaXc6ws8G9OTw+L@U?lcNG;{I^A$cPm5OfnN`?9#w-KR6#I z0QW+4kPmItU^~ayVLw=HXlMN}i#G^K*>ZvbM)FN)`hb!`b8A_f5LV`R0t-`F*elTd zZ}7I1q$pF*PVoC|QScn4K1<-^1U^RKlK_k}{Tq`m!flKPvK`cvLE9zAaeXDGMGB?3sk^g*r< zpTK0BIX0?f&&kN_OCY#QvG~kVp@f)yrswi7w%;a~Gv>d-gzc{^fbEA%6L{;f!`cT9 zJLI~;*?ygf%JsSXzBN**{f(1tR0A0P1ujsSP(u)2a}z!+`ssLzRY#2uP?~hZK64?3 zCXBfgjmr)t(4RtsBJW!CoIJf0F=Ba_u@r zC1{0*l`Cp%Q3`F{`2lHbFY~VXYwxM8 z|HaVOT{pe0#Zf>jYHLvnZQb?3XzP(yRX|WGoU7-Af~z&BL!=tg&ZiCtql$p)ezIUu zfkCFrL_cF6OOt*x8ZEn?>_Z4 zr|Jfw!bj&w`jCS=C1o;oA^afEquepkLSeU`%8(eHGHqZIh$M5|NAZn39$aw1;$Kn$ zTQWLs;FXv3Od9kqBTw(69%StFXXuTl2$1%UxtdV=>F(Dlx)GryyjJFj*THjhhMS8~ z7H?ZqG0NiEe}me624H!qJ9o@9FI+THX!q-V>BfCmitXgANh0GN7a8wVkR;y6jCWQS zZoFgW!yS_N>V}v4(@k4mb!M8jWE!?)shJ7!eI@j{r z#;@1BzWeKqUp_d!n|WD1m2TLVsoFnPLjG58R<5_;v9)sg0G;N3ApbOXh;ZmMck#P` zW8L*M9O16TaAW#_$R+~FK@MSWY4uHKi>CLdq_*k(SiH?i>TgzRqj%F25WbL;meymq zF}*)8Goq;D-K!o;<9;g~&hDfC$#x8lzgpuS*d_hdwi3X<_PHOhOMl%~LU6nLffDJz zx=R4f&y~LyimN4@VK|mN8j?17GE>}TRz^4J z3tuVLSXxhF4Obp72}w8>$$sG5q_jCkQ$M>x?qjW5STpNO{Db!Xa3;DI>pD6?ekDDK zDb?_nragEpAfAd@iZ1P$?40{$dU9cDv>ohhSM-+PftGvAkXR>)w*&qfN}1!CD22Cz zBxFTzRY=?EJ$55)x1coMvTE6H5~~=wT6IfWdhEU8q(}Bc#^XNH!7;!HHELw=`&V+T 
zOUn@wPHl8tWq$w2J?zcK)K2ZG+ISMUfZkuO-Ht6Njn=MRu9W_L%jVSkrA|Ep@IChn zq&(O@$y`Embsmni8=Dv>!x851B5<$#z(4@r zYPe%I_+N=WJr*4#uWbCVwL)Asv2B;!%HwemG35yY0|Z71oFwoNz<_c;1rHH8PT*Yv zhXHbAK!o5-2DAqOgA52Fr?}O)(@Ix{Yy71u8-iAXv;rd*CL6L2K;Ot7N^Un0ehGiF z)+ZK$QLHh zwtQ8ds|luRf?wONN{3c8=R3*dTU|XUYij<|MmIVL&`i&E!v?Did%4L}L3xV6(*#Zv zm?ZEFfhP%^BCw=Lh&K%Dh%Zy-uK)xq3}Pahj=4L|U!%}R39!EXG=*4KGg;ARDf%%2 ztZSd8&~pSnPT-RO>?ju16jT4hMMjm(@+POPvjG!`LvNd%C7E9-4Y`#MtY$5J~UNCyw69(*(%e2k*O z$I`yX=G^LU7AZK7kwwoN!WouRWD^0T(>veFx@%w9m2y9l+ID}s^U#|o(w&b`tn-ny z?~ysT`kQq>a%LBgo!5gImQQ37fs5`(u6lvRe4i9aod$w#&48bsVGy8ch8GWvCPrd|xjVZcDCDLX zdr`HiDF$=hb@O+W$k@xd+#y$aN^_@B?4$E=NM8-j^HAYKwWF%=g}5qCyGm+r*y6g5 z2@30TEUp_qfEL#c1`(_2@>Uls%8f#Jw@Pj@2=BlY@yk$`sUXx;c-M@@=k!Rk!Ol3K z+I|@^aZE{!;GpFsaG1MQ7e>Z(Sl_qt{6cn^1SlW9?#fk!%C}G~YwhDZO&s|uHl}xE z)jv$tzea#<3i$?wh6qFf2KcF00gP?tMHV|v3k<~)J4aThq}8d~)oCxz!&QH?(rPmF)e{iDkdv0y zW4M9hdASiqC4A?|H7EMwjf%3ps}|WgvM4QM4sdS%&Ji*tdyllNb{Se$cf+)-p3$;~ z99q`+0iX#F=x(E4pyaTQF^`fuVvNjFB&)~2MjsoJ$^@7g&@{mn{iiJ<8T z2w%uaOY1S*K=HiXh@uis(3HKzJ)%*>CD^j}dlTHPCS};1RY#;QE@9)qbf|=BarG0F zA!#s>0heeG5x`c8dUld%gYhI$Lm6thyvwLvT{frC-Ux=NR3?T=KiyO4rww%pErb4O zJXPr03=(@VOmS?d7@NWOPlbx_M1d8aARS=e-*WUhgT_MAo@=I2H-5j|dQ1@4BSIUc zm9|(+^xb;h=4_#`z|{0Ed7Gondqk9i&Iy~}ROZwpx62*k_`I@Xg}1fP$>g^6*r#)t z)6U6*Z1u!8CJ?)&ZGwx8J{5tt##nrCbYcjy%jhYbIuRRYo8y#dWbg#ij-L(~n6rXu z{bj=U%Y6Xvg?UwWgGMlFuLLQ?==um%dV)ZV0PT4jP{I_95MV4Utx9zk7OUo6ln^Ii zCqOn}23cS3R*5{(-?HXk`d+%{sM==7hORHnuPY+nz9Q>3JmN8hLm5xuuM#p3EB*WBfU6^}#?};RnQH5kCJxY|h z&CZsHB=}C{X&mkUmEMmDz_%cjmB!&MguInzixSCf@mOU1#AxieLbG$$KN1}e4^PlW zpr@iZIvpqqHz`RzkZ71^i-%)Rk54F3h2}1WD5BCsfYcFu Date: Wed, 29 Oct 2025 11:02:04 -0500 Subject: [PATCH 056/149] removing plottingh --- .github/workflows/collect-results.yml | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 14c499c0d..1afe9f049 100644 --- a/.github/workflows/collect-results.yml +++ 
b/.github/workflows/collect-results.yml @@ -35,16 +35,3 @@ jobs: with: name: results_${{ inputs.exp-name }} path: agg_${{ inputs.exp-name }}.json - - - name: Plot performance - run: | - pip install -q matplotlib - python3 utils/plot_perf.py results/ ${{ inputs.exp-name }} - - - name: Upload performance graphs - uses: actions/upload-artifact@v4 - with: - name: graphs_${{ inputs.exp-name }} - path: | - tput_vs_intvty_*_${{ inputs.exp-name }}.png - tput_vs_e2el_*_${{ inputs.exp-name }}.png From 7b2acaac58bcec2c958ec44879c3fb5135415816 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:40:06 -0500 Subject: [PATCH 057/149] removing plottingh --- .github/workflows/1k8k-sweep.yml | 12 +- .github/workflows/8k1k-sweep.yml | 12 +- .github/workflows/benchmark-tmpl.yml | 14 +- .github/workflows/test.yml | 4 +- utils/matrix-logic/generate_sweep_configs.py | 132 +++++++++++++++++-- 5 files changed, 142 insertions(+), 32 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index cced99997..68fbac028 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -70,8 +70,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -94,8 +94,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -118,8 +118,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} 
- dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 58c676b56..7be91c4fb 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -70,8 +70,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-dsr1: @@ -94,8 +94,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} benchmark-gptoss: @@ -118,8 +118,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} collect-70b-results: diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4f8468a82..4fb327381 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -26,27 +26,25 @@ on: osl: required: true type: string - random-range-ratio: - required: false - type: string - default: '0.2' tp: required: true type: string ep: - required: false + required: true type: string - default: '1' dp-attn: - required: false + required: true type: boolean - default: false max-model-len: required: true type: string conc: required: true type: string + random-range-ratio: 
+ required: false + type: string + default: '0.2' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index e56fc9a82..3d4fd2c5f 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -46,8 +46,8 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} calc-success-rate: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 1c3472eb8..7ae81789e 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -151,8 +151,8 @@ def generate_full_sweep(args, all_config_data): 'conc': conc, 'model-code': model_code, 'max-model-len': isl + osl, - 'ep': 1, # Default - 'dp-attn': False, # Default + 'ep': 1, # Default + 'dp-attn': False, # Default } # Add optional fields if they exist @@ -177,10 +177,26 @@ def generate_test_config(args, all_config_data): Assumes all_config_data has been validated by validate_config_structure(). """ + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") + # Extract model code from config key model_code = args.key.split('-')[0] - val = all_config_data[args.key] + val = all_config_data.get(args.key) + + if not val: + raise ValueError( + f"Specified key '{args.key}' does not exist in config files.") + + runner_nodes = runner_config.get(val['runner'], []) + if args.runner_node not in runner_nodes: + raise ValueError( + f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val['runner']}'. 
Available runner nodes for this config are '{', '.join(runner_nodes)}'.") seq_len_configs = val['seq-len-configs'] image = val['image'] @@ -282,7 +298,8 @@ def generate_runner_model_sweep_config(args, all_config_data): with open(args.runner_config, 'r') as f: runner_config = yaml.safe_load(f) except FileNotFoundError as e: - raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") runner_nodes = runner_config.get(args.runner_type) @@ -344,6 +361,46 @@ def generate_runner_model_sweep_config(args, all_config_data): return matrix_values +def generate_custom_test(args): + """Generate single 1k1k job for custom inputs. + """ + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") + + found_runner_label = False + for runner_type, runner_nodes in runner_config.items(): + if args.runner_label == runner_type or args.runner_label in runner_nodes: + found_runner_label = True + + if not found_runner_label: + raise ValueError(f"Unable to find specified runner label '{args.runner_label}'.") + + if not runner_nodes: + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + + return [ + { + 'image': args.image, + 'model': args.model, + 'precision': args.precision, + 'framework': args.framework, + 'runner': args.runner_label, + # Again, just use 1k1k since this is just meant to smoke test all runners + 'isl': 1024, + 'osl': 1024, + 'tp': 8, + 'conc': 4, + 'model-code': args.model, + 'max-model-len': 2048, + } + ] + + def generate_runner_sweep_config(args, all_config_data): """Generate runner sweep configurations. 
@@ -353,8 +410,8 @@ def generate_runner_sweep_config(args, all_config_data): with open(args.runner_config, 'r') as f: runner_config = yaml.safe_load(f) except FileNotFoundError as e: - raise ValueError(f"Runner config file '{args.runner_config}' does not exist.") - + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") matrix_values = [] for key, val in all_config_data.items(): @@ -369,7 +426,7 @@ def generate_runner_sweep_config(args, all_config_data): # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME model_code = key.split('-')[0] - + runner_nodes = runner_config.get(val['runner']) if not runner_nodes: raise ValueError( @@ -510,7 +567,12 @@ def main(): 'test-config', parents=[parent_parser], add_help=False, - help='Generate test configurations for a specific key' + help='Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config.' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' ) test_config_parser.add_argument( '--key', @@ -551,7 +613,7 @@ def main(): 'runner-model-sweep', parents=[parent_parser], add_help=False, - help='Sweep across all runner nodes and all compatible models for a given runner' + help='Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner nodes.' 
) test_config_parser.add_argument( '--runner-type', @@ -574,7 +636,7 @@ def main(): 'runner-sweep', parents=[parent_parser], add_help=False, - help='For a given model, run configurations on all compatible runners' + help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runners nodes that should run gptoss-120b actually do so successfully.' ) test_config_parser.add_argument( '--model-prefix', @@ -602,6 +664,54 @@ def main(): help='Show this help message and exit' ) + # Subcommand: custom + test_config_parser = subparsers.add_parser( + 'custom', + parents=[parent_parser], + add_help=False, + help='Enter custom values' + ) + test_config_parser.add_argument( + '--runner-label', + required=True, + help='Label associated with runner on which to launch the corresponding job (e.g., h200, h200-nv_1, etc.)' + ) + test_config_parser.add_argument( + '--image', + required=True, + help='Image to run the benchmark (e.g., openai/gpt-oss-120b)' + ) + test_config_parser.add_argument( + '--model', + required=True, + help='Model to run (e.g., vllm/vllm-openai:latest)' + ) + test_config_parser.add_argument( + '--framework', + required=True, + help='Framework to run on (e.g., vllm, trt, sglang)' + ) + test_config_parser.add_argument( + '--precision', + required=True, + help='Precision to run (e.g., fp4, fp8)' + ) + test_config_parser.add_argument( + '--exp-name', + required=True, + help='Experiment name (e.g., 70b_test)' + ) + test_config_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + args = parser.parse_args() # Load and validate configuration 
files @@ -619,6 +729,8 @@ def main(): elif args.command == 'runner-sweep': matrix_values = generate_runner_sweep_config( args, all_config_data) + elif args.command == 'custom': + matrix_values = generate_custom_test(args) else: parser.error(f"Unknown command: {args.command}") From ad18b5112b873d45cc893cef786c10fa1b7ea0f9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:43:08 -0500 Subject: [PATCH 058/149] removing plottingh --- utils/matrix-logic/generate_sweep_configs.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 7ae81789e..fd49b7b55 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -394,6 +394,8 @@ def generate_custom_test(args): 'isl': 1024, 'osl': 1024, 'tp': 8, + 'ep': 1, + 'dp-attn': False, 'conc': 4, 'model-code': args.model, 'max-model-len': 2048, From 165bde31694fc8a027bb06a31db02b91b744fc11 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:45:00 -0500 Subject: [PATCH 059/149] removing plottingh --- .github/workflows/test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 3d4fd2c5f..9536d6db1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -36,7 +36,7 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - exp-name: ${{ matrix.config.model-code }}_test_${{ matrix.config.isl }}_${{ matrix.config.osl }} + exp-name: ${{ matrix.config.exp-name }} isl: ${{ matrix.config.isl }} osl: ${{ matrix.config.osl }} max-model-len: ${{ matrix.config.max-model-len }} From 52153c7edb600b6fa328832e7f2dbaf6b6233077 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:46:17 -0500 Subject: [PATCH 060/149] removing plottingh --- utils/matrix-logic/generate_sweep_configs.py | 1 + 1 file changed, 
1 insertion(+) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index fd49b7b55..8db70fa8b 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -398,6 +398,7 @@ def generate_custom_test(args): 'dp-attn': False, 'conc': 4, 'model-code': args.model, + 'exp-name': args.exp_name, 'max-model-len': 2048, } ] From db05e34556554251623978077e1d141f519df52b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 13:49:46 -0500 Subject: [PATCH 061/149] removing plotting python script --- utils/plot_perf.py | 197 --------------------------------------------- 1 file changed, 197 deletions(-) delete mode 100644 utils/plot_perf.py diff --git a/utils/plot_perf.py b/utils/plot_perf.py deleted file mode 100644 index 1cab81cdc..000000000 --- a/utils/plot_perf.py +++ /dev/null @@ -1,197 +0,0 @@ -import sys -import json -from pathlib import Path -import matplotlib.pyplot as plt - - -results_dir = Path(sys.argv[1]) -exp_name = sys.argv[2] -hw_color = { - 'h100': 'lightgreen', - 'h200': 'green', # H200 VLLM - 'h200-trt': 'darkgreen', # H200 TRT-LLM - 'b200': 'black', # B200 VLLM - 'b200-trt': 'gray', # B200 TRT-LLM - 'mi300x': 'pink', - 'mi325x': 'red', - 'mi355x': 'purple', - 'gb200': 'orange', # GB200 TRT-LLM and SGlang -} - -results = [] -for result_path in results_dir.rglob(f'*.json'): - with open(result_path) as f: - result = json.load(f) - results.append(result) - - -def plot_tput_vs_e2el(precision_filter=None): - fig, ax = plt.subplots() - - # Filter results by precision if specified - filtered_results = results - if precision_filter is not None: - filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r 
for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_e2el'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if fp4_results: - xs_fp4 = [r['median_e2el'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in filtered_results: - x, y = result['median_e2el'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('End-to-end Latency (s)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='GPU Type') - fig.tight_layout() - - precision_suffix = f"_{precision_filter}" if precision_filter else "" - fig.savefig(f'tput_vs_e2el_{exp_name}{precision_suffix}.png', bbox_inches='tight') - plt.close(fig) - - -def plot_tput_vs_intvty(precision_filter=None): - fig, ax = plt.subplots() - - # Filter results by precision if specified - filtered_results = results - if precision_filter is not None: - filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_intvty'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if 
fp4_results: - xs_fp4 = [r['median_intvty'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in filtered_results: - x, y = result['median_intvty'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('Interactivity (tok/s/user)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='GPU Type') - fig.tight_layout() - - precision_suffix = f"_{precision_filter}" if precision_filter else "" - fig.savefig(f'tput_vs_intvty_{exp_name}{precision_suffix}.png', bbox_inches='tight') - plt.close(fig) - - -def plot_tput_vs_e2el_for_model(model_results, model_name): - fig, ax = plt.subplots() - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_e2el'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if fp4_results: - xs_fp4 = [r['median_e2el'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in model_results: - x, y = result['median_e2el'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('End-to-end Latency (s)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='Hardware + Framework') - ax.set_title(f'{model_name} - 
All Frameworks') - fig.tight_layout() - - # Extract model identifier from model name - model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name - fig.savefig(f'tput_vs_e2el_{model_id}_{exp_name}.png', bbox_inches='tight') - plt.close(fig) - - -def plot_tput_vs_intvty_for_model(model_results, model_name): - fig, ax = plt.subplots() - - for hw_label, color in hw_color.items(): - # Separate fp8 and fp4 results for this hardware - fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] - fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] - - # Plot fp8 results with circles - if fp8_results: - xs_fp8 = [r['median_intvty'] for r in fp8_results] - ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] - ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) - - # Plot fp4 results with squares - if fp4_results: - xs_fp4 = [r['median_intvty'] for r in fp4_results] - ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] - ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) - - for result in model_results: - x, y = result['median_intvty'], result['tput_per_gpu'] - ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) - - ax.set_xlabel('Interactivity (tok/s/user)') - ax.set_ylabel('Throughput per GPU (tok/s)') - ax.legend(title='Hardware + Framework') - ax.set_title(f'{model_name} - All Frameworks') - fig.tight_layout() - - # Extract model identifier from model name - model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name - fig.savefig(f'tput_vs_intvty_{model_id}_{exp_name}.png', bbox_inches='tight') - plt.close(fig) - - -# Create one plot per model showing all frameworks and hardware -# Group results by model family (70b, dsr1, etc.) 
instead of full model name -def get_model_family(model_name): - if '70b' in model_name.lower() or 'llama-3.3-70b' in model_name.lower(): - return '70b' - elif 'dsr1' in model_name.lower() or 'deepseek-r1' in model_name.lower(): - return 'dsr1' - else: - # Fallback to first part of model name - return model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name - -model_families = set(get_model_family(r.get('model', 'unknown')) for r in results) - -for model_family in model_families: - # Filter results for this model family - model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family] - - # Create plots for this model family - plot_tput_vs_e2el_for_model(model_results, model_family) - plot_tput_vs_intvty_for_model(model_results, model_family) From efdfcaf14e28e7614c56e7cc4325d2db7ddec182 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:08:19 -0500 Subject: [PATCH 062/149] bmk-space -> search-space --- .github/configs/amd-master.yaml | 66 ++++++------ .github/configs/nvidia-master.yaml | 108 +++++++++---------- utils/matrix-logic/generate_sweep_configs.py | 78 +++++++++++--- 3 files changed, 148 insertions(+), 104 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index a501ead63..81c436366 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -7,21 +7,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 4, 
conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -36,21 +36,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -65,21 +65,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 32, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 64, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 32 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -94,21 +94,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } 
@@ -123,16 +123,16 @@ dsr1-fp4-mi355x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi300x-sgl: @@ -144,15 +144,15 @@ dsr1-fp8-mi300x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi325x-sgl: @@ -164,15 +164,15 @@ dsr1-fp8-mi325x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-mi355x-sgl: @@ -184,15 +184,15 @@ dsr1-fp8-mi355x-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } gptoss-fp4-mi300x-vllm: @@ -204,21 +204,21 @@ gptoss-fp4-mi300x-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 - 
bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -233,21 +233,21 @@ gptoss-fp4-mi325x-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 64, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 8 } - { tp: 4, conc-start: 4, conc-end: 8 } @@ -262,19 +262,19 @@ gptoss-fp4-mi355x-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 16 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 16 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 16 } - { tp: 8, conc-start: 4, conc-end: 16 } diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5c006dc91..fe9ef989d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -7,21 +7,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { 
tp: 4, conc-start: 16, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 128 } - { tp: 2, conc-start: 16, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -36,21 +36,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 64 } - { tp: 2, conc-start: 16, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 32 } @@ -65,21 +65,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 16, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 128 } - { tp: 2, conc-start: 16, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } @@ -94,21 +94,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 16, conc-end: 64 } - { 
tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 32, conc-end: 64 } - { tp: 2, conc-start: 16, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -123,19 +123,19 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } @@ -149,21 +149,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 128, conc-end: 128 } - { tp: 2, conc-start: 64, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 128 } - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 32 } @@ -177,21 +177,21 @@ seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 32, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 64, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 16, conc-end: 64 } 
- { tp: 2, conc-start: 16, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -206,17 +206,17 @@ dsr1-fp4-b200-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 128 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 4, conc-start: 4, conc-end: 128 } - { tp: 8, conc-start: 4, conc-end: 16 } @@ -229,7 +229,7 @@ dsr1-fp4-b200-trt: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: # If TP=4, # If CONC > 32, then EP=4 # If CONC >= 256, DP_ATTN=true @@ -244,7 +244,7 @@ dsr1-fp4-b200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - isl: 1024 osl: 8192 - bmk-space: + search-space: # If TP=4, # If CONC > 32, then EP=4 # If CONC >= 256, DP_ATTN=true @@ -259,7 +259,7 @@ dsr1-fp4-b200-trt: - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - isl: 8192 osl: 1024 - bmk-space: + search-space: # If TP=4, # If CONC > 32, then EP=4 and DP_ATTN=true - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } @@ -278,15 +278,15 @@ dsr1-fp8-b200-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-b200-trt: @@ -299,18 +299,18 @@ dsr1-fp8-b200-trt: # For all sequence lengths, EP=TP - isl: 1024 osl: 1024 - bmk-space: + search-space: # If CONC > 32, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + 
search-space: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } @@ -323,15 +323,15 @@ dsr1-fp8-h200-sgl: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 8, conc-start: 4, conc-end: 64 } dsr1-fp8-h200-trt: @@ -345,17 +345,17 @@ dsr1-fp8-h200-trt: - isl: 1024 osl: 1024 # If CONC > 64, then DP_ATTN=true - bmk-space: + search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 # If CONC > 64, then DP_ATTN=true - bmk-space: + search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 # If CONC > 32, then DP_ATTN=true - bmk-space: + search-space: - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } @@ -369,21 +369,21 @@ gptoss-fp4-b200-trt: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -398,21 +398,21 @@ gptoss-fp4-b200-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, 
conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } @@ -427,19 +427,19 @@ gptoss-fp4-h100-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } @@ -453,21 +453,21 @@ gptoss-fp4-h200-trt: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } @@ -482,21 +482,21 @@ gptoss-fp4-h200-vllm: seq-len-configs: - isl: 1024 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 16 } - { tp: 2, conc-start: 
4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 - bmk-space: + search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 8db70fa8b..252c87bf9 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -1,6 +1,8 @@ import json import yaml import argparse +from pydantic import BaseModel, Field, ValidationError, ConfigDict +from typing import List seq_len_stoi = { "1k1k": (1024, 1024), @@ -9,6 +11,39 @@ } +class MatrixEntry(BaseModel): + """Pydantic model for validating matrix entry structure.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + precision: str + framework: str + runner: str + isl: int + osl: int + tp: int + ep: int + dp_attn: bool = Field(alias='dp-attn') + conc: int + max_model_len: int = Field(alias='max-model-len') + exp_name: str = Field(alias='exp-name') + + +def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: + """Validate that matrix_values entries match the expected structure. + + Raises ValueError if any entry fails validation. + Returns the original list if all entries are valid. + """ + for i, entry in enumerate(matrix_values): + try: + MatrixEntry(**entry) + except ValidationError as e: + raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}") + return matrix_values + + def validate_master_configs_structure(all_config_data): """Validate the structure of all master config entries. 
@@ -57,12 +92,12 @@ def validate_master_configs_structure(all_config_data): raise ValueError( f"'osl' must be int in seq-len-config[{i}] for key '{key}'") - bmk_space = seq_config.get('bmk-space') + bmk_space = seq_config.get('search-space') if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: raise ValueError( - f"Missing or invalid 'bmk-space' in seq-len-config[{i}] for key '{key}'") + f"Missing or invalid 'search-space' in seq-len-config[{i}] for key '{key}'") - # Validate each benchmark in bmk-space + # Validate each benchmark in search-space for j, bmk in enumerate(bmk_space): # Define allowed fields allowed_fields = {'tp', 'conc-start', @@ -75,23 +110,23 @@ def validate_master_configs_structure(all_config_data): extra_fields = set(bmk.keys()) - allowed_fields if extra_fields: raise ValueError( - f"Extra fields {extra_fields} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"Extra fields {extra_fields} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate required fields for field, expected_type in required_bmk_fields.items(): if field not in bmk or bmk[field] is None: raise ValueError( - f"Missing '{field}' in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"Missing '{field}' in search-space[{j}] of seq-len-config[{i}] for key '{key}'") if not isinstance(bmk[field], expected_type): raise ValueError( - f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") # Validate optional fields if they exist for field, expected_type in optional_bmk_fields.items(): if field in bmk and bmk[field] is not None: if not isinstance(bmk[field], expected_type): raise ValueError( - f"'{field}' must be {expected_type.__name__} in bmk-space[{j}] of seq-len-config[{i}] for key '{key}'") + f"'{field}' must be {expected_type.__name__} in search-space[{j}] of 
seq-len-config[{i}] for key '{key}'") def generate_full_sweep(args, all_config_data): @@ -127,7 +162,7 @@ def generate_full_sweep(args, all_config_data): if not matching_seq_config: continue # Skip this config if no matching sequence length - bmk_space = matching_seq_config['bmk-space'] + bmk_space = matching_seq_config['search-space'] for bmk in bmk_space: tp = bmk['tp'] @@ -149,10 +184,10 @@ def generate_full_sweep(args, all_config_data): 'osl': osl, 'tp': tp, 'conc': conc, - 'model-code': model_code, 'max-model-len': isl + osl, 'ep': 1, # Default 'dp-attn': False, # Default + 'exp-name': f"{model_code}_test", } # Add optional fields if they exist @@ -222,7 +257,7 @@ def generate_test_config(args, all_config_data): if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config['bmk-space'] + bmk_space = seq_config['search-space'] for bmk in bmk_space: tp = bmk['tp'] @@ -236,15 +271,17 @@ def generate_test_config(args, all_config_data): entry = { 'image': image, 'model': model, - 'model-code': model_code, 'precision': precision, 'framework': framework, 'runner': runner, 'isl': isl, 'osl': osl, 'tp': tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': conc_start, 'max-model-len': isl + osl, + 'exp-name': f"{model_code}_test", } # Add optional fields if they exist @@ -261,13 +298,14 @@ def generate_test_config(args, all_config_data): entry = { 'image': image, 'model': model, - 'model-code': model_code, 'precision': precision, 'framework': framework, 'runner': runner, 'isl': isl, 'osl': osl, 'tp': tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': conc, 'max-model-len': isl + osl, } @@ -324,7 +362,7 @@ def generate_runner_model_sweep_config(args, all_config_data): target_config = config break - highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) # Since we are just testing, pick the highest TP for this config and just 
test # on that TP with the lowest concurrency available highest_tp = highest_tp_bmk['tp'] @@ -345,9 +383,11 @@ def generate_runner_model_sweep_config(args, all_config_data): 'isl': 1024, 'osl': 1024, 'tp': highest_tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': lowest_conc, - 'model-code': model_code, 'max-model-len': 2048, + 'exp-name': f"{model_code}_test", } # Add optional fields if they exist @@ -397,7 +437,6 @@ def generate_custom_test(args): 'ep': 1, 'dp-attn': False, 'conc': 4, - 'model-code': args.model, 'exp-name': args.exp_name, 'max-model-len': 2048, } @@ -442,7 +481,7 @@ def generate_runner_sweep_config(args, all_config_data): target_config = config break - highest_tp_bmk = max(target_config['bmk-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) # Since we are just testing, pick the highest TP for this config and just test # on that TP with the lowest concurrency available highest_tp = highest_tp_bmk['tp'] @@ -463,8 +502,10 @@ def generate_runner_sweep_config(args, all_config_data): 'isl': 1024, 'osl': 1024, 'tp': highest_tp, + 'ep': 1, # Default, + 'dp-attn': False, # Default 'conc': lowest_conc, - 'model-code': model_code, + 'exp-name': f"{model_code}_test", 'max-model-len': 2048, } @@ -737,6 +778,9 @@ def main(): else: parser.error(f"Unknown command: {args.command}") + # Validate output before printing + validate_matrix_output(matrix_values) + print(json.dumps(matrix_values)) return matrix_values From 15eead5a1998950e038261596e9fe9f11b5b4b89 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:10:32 -0500 Subject: [PATCH 063/149] updating exp name for full sweep --- utils/matrix-logic/generate_sweep_configs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 252c87bf9..ed68a9633 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ 
b/utils/matrix-logic/generate_sweep_configs.py @@ -187,7 +187,7 @@ def generate_full_sweep(args, all_config_data): 'max-model-len': isl + osl, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_test", + 'exp-name': f"{model_code}_{isl}_{osl}_sweep", } # Add optional fields if they exist From 6bbc02859d119c99887cd3e5281264b0e2cb3db5 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:12:55 -0500 Subject: [PATCH 064/149] pip install pydantic --- .github/workflows/1k1k-sweep.yml | 3 +++ .github/workflows/test.yml | 1 + 2 files changed, 4 insertions(+) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index dd8ae9f9c..958fd73b9 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -21,6 +21,7 @@ jobs: - id: get-70b-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT @@ -34,6 +35,7 @@ jobs: - id: get-dsr1-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT @@ -47,6 +49,7 @@ jobs: - id: get-gptoss-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git 
a/.github/workflows/test.yml b/.github/workflows/test.yml index 9536d6db1..78b9b1f5e 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -23,6 +23,7 @@ jobs: - id: get-jobs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT From b84fffe77c8305116f52f6c50e8f88e32293c1d0 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:31:12 -0500 Subject: [PATCH 065/149] add filtered sweep --- utils/matrix-logic/generate_sweep_configs.py | 192 ++++++++++++++++++- 1 file changed, 191 insertions(+), 1 deletion(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index ed68a9633..bf8ccc065 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -184,7 +184,7 @@ def generate_full_sweep(args, all_config_data): 'osl': osl, 'tp': tp, 'conc': conc, - 'max-model-len': isl + osl, + 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default 'exp-name': f"{model_code}_{isl}_{osl}_sweep", @@ -207,6 +207,143 @@ def generate_full_sweep(args, all_config_data): return matrix_values +def generate_filtered_sweep(args, all_config_data): + """Generate sweep configurations with filtering options. + + Allows filtering by model prefix, precision, framework, runner type, and sequence lengths. + Supports test mode to only run highest TP with lowest concurrency. + + Assumes all_config_data has been validated by validate_config_structure(). 
+ """ + matrix_values = [] + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + for key, val in all_config_data.items(): + # Filter by model prefix if specified + if args.model_prefix and not key.startswith(args.model_prefix): + continue + + # Filter by precision if specified + if args.precision and val['precision'] != args.precision: + continue + + # Filter by framework if specified + if args.framework and val['framework'] != args.framework: + continue + + # Filter by runner type if specified + if args.runner_type and val['runner'] != args.runner_type: + continue + + seq_len_configs = val['seq-len-configs'] + image = val['image'] + model = val['model'] + precision = val['precision'] + framework = val['framework'] + runner = val['runner'] + model_code = key.split('-')[0] + + for seq_config in seq_len_configs: + isl = seq_config['isl'] + osl = seq_config['osl'] + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config['search-space'] + + if args.test_mode: + # In test mode, use highest TP with lowest concurrency + highest_tp_bmk = max(bmk_space, key=lambda x: x['tp']) + tp = highest_tp_bmk['tp'] + conc = highest_tp_bmk['conc-start'] + ep = highest_tp_bmk.get('ep') + dp_attn = highest_tp_bmk.get('dp-attn') + + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'ep': 1, # Default + 'dp-attn': False, # Default + 'conc': conc, + 'max-model-len': isl + osl + 200, + 'exp-name': f"{model_code}_{isl}_{osl}_test", + } + + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + else: + # Full sweep mode + for bmk in bmk_space: + tp = bmk['tp'] + conc_start = bmk['conc-start'] + conc_end = 
bmk['conc-end'] + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl + 200, + 'ep': 1, # Default + 'dp-attn': False, # Default + 'exp-name': f"{model_code}_{isl}_{osl}_sweep", + } + + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + if len(matrix_values) == 0: + error_msg = "No configs found matching filters:" + if args.model_prefix: + error_msg += f" model-prefix='{args.model_prefix}'" + if args.precision: + error_msg += f" precision='{args.precision}'" + if args.framework: + error_msg += f" framework='{args.framework}'" + if args.runner_type: + error_msg += f" runner-type='{args.runner_type}'" + if seq_lens_filter: + error_msg += f" seq-lens={list(args.seq_lens)}" + raise ValueError(error_msg) + + return matrix_values + + def generate_test_config(args, all_config_data): """Generate test configurations for a specific key. 
@@ -606,6 +743,57 @@ def main(): help='Show this help message and exit' ) + # Subcommand: filtered-sweep + filtered_sweep_parser = subparsers.add_parser( + 'filtered-sweep', + parents=[parent_parser], + add_help=False, + help='Generate sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' + ) + filtered_sweep_parser.add_argument( + '--model-prefix', + required=False, + help='Model prefix to filter configurations (optional)' + ) + filtered_sweep_parser.add_argument( + '--precision', + required=False, + help='Precision to filter by (e.g., fp4, fp8) (optional)' + ) + filtered_sweep_parser.add_argument( + '--framework', + required=False, + help='Framework to filter by (e.g., vllm, trt, sglang) (optional)' + ) + filtered_sweep_parser.add_argument( + '--runner-type', + required=False, + help='Runner type to filter by (e.g., h200, h100) (optional)' + ) + filtered_sweep_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + filtered_sweep_parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + filtered_sweep_parser.add_argument( + '--test-mode', + action='store_true', + help='Test mode: only run highest TP with lowest concurrency for each matching config' + ) + filtered_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + # Subcommand: test-config test_config_parser = subparsers.add_parser( 'test-config', @@ -765,6 +953,8 @@ def main(): # Route to appropriate function based on subcommand if args.command == 'full-sweep': matrix_values = generate_full_sweep(args, all_config_data) + elif args.command == 'filtered-sweep': + matrix_values = generate_filtered_sweep(args, all_config_data) elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) elif args.command == 'runner-model-sweep': From df8877dfbe8705ff63f17b49cb9a49837c205968 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:39:56 -0500 Subject: [PATCH 066/149] allow multiple filter values --- utils/matrix-logic/generate_sweep_configs.py | 58 +++++++++++++++----- 1 file changed, 44 insertions(+), 14 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index bf8ccc065..9bfc2ac1f 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -215,6 +215,26 @@ def generate_filtered_sweep(args, all_config_data): Assumes all_config_data has been validated by validate_config_structure(). 
""" + # Validate runner types if specified + if args.runner_type: + if not args.runner_config: + raise ValueError( + "--runner-config is required when --runner-type is specified") + + try: + with open(args.runner_config, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError: + raise ValueError( + f"Runner config file '{args.runner_config}' does not exist.") + + valid_runner_types = set(runner_config.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. " + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}") + matrix_values = [] # Convert seq-lens to set of (isl, osl) tuples for filtering @@ -224,19 +244,20 @@ def generate_filtered_sweep(args, all_config_data): for key, val in all_config_data.items(): # Filter by model prefix if specified - if args.model_prefix and not key.startswith(args.model_prefix): - continue + if args.model_prefix: + if not any(key.startswith(prefix) for prefix in args.model_prefix): + continue # Filter by precision if specified - if args.precision and val['precision'] != args.precision: + if args.precision and val['precision'] not in args.precision: continue # Filter by framework if specified - if args.framework and val['framework'] != args.framework: + if args.framework and val['framework'] not in args.framework: continue # Filter by runner type if specified - if args.runner_type and val['runner'] != args.runner_type: + if args.runner_type and val['runner'] not in args.runner_type: continue seq_len_configs = val['seq-len-configs'] @@ -330,15 +351,15 @@ def generate_filtered_sweep(args, all_config_data): if len(matrix_values) == 0: error_msg = "No configs found matching filters:" if args.model_prefix: - error_msg += f" model-prefix='{args.model_prefix}'" + error_msg += f" model-prefix={args.model_prefix}" if args.precision: - error_msg += f" precision='{args.precision}'" + error_msg += f" 
precision={args.precision}" if args.framework: - error_msg += f" framework='{args.framework}'" + error_msg += f" framework={args.framework}" if args.runner_type: - error_msg += f" runner-type='{args.runner_type}'" + error_msg += f" runner-type={args.runner_type}" if seq_lens_filter: - error_msg += f" seq-lens={list(args.seq_lens)}" + error_msg += f" seq-lens={args.seq_lens}" raise ValueError(error_msg) return matrix_values @@ -752,23 +773,32 @@ def main(): ) filtered_sweep_parser.add_argument( '--model-prefix', + nargs='+', required=False, - help='Model prefix to filter configurations (optional)' + help='Model prefix(es) to filter configurations (optional, can specify multiple)' ) filtered_sweep_parser.add_argument( '--precision', + nargs='+', required=False, - help='Precision to filter by (e.g., fp4, fp8) (optional)' + help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' ) filtered_sweep_parser.add_argument( '--framework', + nargs='+', required=False, - help='Framework to filter by (e.g., vllm, trt, sglang) (optional)' + help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' ) filtered_sweep_parser.add_argument( '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' + ) + filtered_sweep_parser.add_argument( + '--runner-config', required=False, - help='Runner type to filter by (e.g., h200, h100) (optional)' + help='Configuration file holding runner information (required if --runner-type is specified)' ) filtered_sweep_parser.add_argument( '--seq-lens', From b0aaf6a6a93efda49738d65e595a62bb61e8365b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:45:02 -0500 Subject: [PATCH 067/149] reverse seq len mapping --- utils/matrix-logic/generate_sweep_configs.py | 21 +++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py 
b/utils/matrix-logic/generate_sweep_configs.py index 9bfc2ac1f..7f1b76490 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -10,6 +10,18 @@ "8k1k": (8192, 1024) } +# Reverse mapping for exp-name generation +seq_len_itos = {v: k for k, v in seq_len_stoi.items()} + + +def seq_len_to_str(isl: int, osl: int) -> str: + """Convert sequence lengths to short string representation. + + Returns the short name (e.g., '1k1k') if it exists in the mapping, + otherwise returns 'isl_osl' format. + """ + return seq_len_itos.get((isl, osl), f"{isl}_{osl}") + class MatrixEntry(BaseModel): """Pydantic model for validating matrix entry structure.""" @@ -174,6 +186,7 @@ def generate_full_sweep(args, all_config_data): # Generate entries for each concurrency value in the range conc = conc_start while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) entry = { 'image': image, 'model': model, @@ -187,7 +200,7 @@ def generate_full_sweep(args, all_config_data): 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{isl}_{osl}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}_sweep", } # Add optional fields if they exist @@ -286,6 +299,7 @@ def generate_filtered_sweep(args, all_config_data): ep = highest_tp_bmk.get('ep') dp_attn = highest_tp_bmk.get('dp-attn') + seq_len_str = seq_len_to_str(isl, osl) entry = { 'image': image, 'model': model, @@ -299,7 +313,7 @@ def generate_filtered_sweep(args, all_config_data): 'dp-attn': False, # Default 'conc': conc, 'max-model-len': isl + osl + 200, - 'exp-name': f"{model_code}_{isl}_{osl}_test", + 'exp-name': f"{model_code}_{seq_len_str}_test", } if ep is not None: @@ -319,6 +333,7 @@ def generate_filtered_sweep(args, all_config_data): conc = conc_start while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) entry = { 'image': image, 'model': model, @@ -332,7 +347,7 @@ def generate_filtered_sweep(args, all_config_data): 
'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{isl}_{osl}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}_sweep", } if ep is not None: From de9e367123ce3017aa6a64e3db4917d220a9b422 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 14:47:43 -0500 Subject: [PATCH 068/149] less verbose --- utils/matrix-logic/generate_sweep_configs.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 7f1b76490..6d092eac8 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -200,7 +200,7 @@ def generate_full_sweep(args, all_config_data): 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}", } # Add optional fields if they exist @@ -313,7 +313,7 @@ def generate_filtered_sweep(args, all_config_data): 'dp-attn': False, # Default 'conc': conc, 'max-model-len': isl + osl + 200, - 'exp-name': f"{model_code}_{seq_len_str}_test", + 'exp-name': f"{model_code}_{seq_len_str}", } if ep is not None: @@ -347,7 +347,7 @@ def generate_filtered_sweep(args, all_config_data): 'max-model-len': isl + osl + 200, 'ep': 1, # Default 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}_sweep", + 'exp-name': f"{model_code}_{seq_len_str}", } if ep is not None: From 6df2657dfaa2f81f161fcf577c55b4af3375b483 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 069/149] deleting files --- .github/workflows/{test.yml => e2e-tests.yml} | 2 +- .../workflows/full-sweep-1k1k-scheduler.yml | 59 - .../workflows/full-sweep-1k8k-scheduler.yml | 59 - .../workflows/full-sweep-8k1k-scheduler.yml | 59 - .github/workflows/full-sweep-test.yml | 89 - 
.github/workflows/full-sweep-tmpl.yml | 263 --- .github/workflows/runner-model-sweep-test.yml | 300 ---- .github/workflows/runner-sweep-test.yml | 333 ---- .github/workflows/runner-test.yml | 136 -- .gitignore | 2 + utils/matrix-logic/generate_sweep_configs.py | 135 +- utils/matrix-logic/pytest.ini | 12 + .../test_generate_sweep_configs.py | 1545 +++++++++++++++++ 13 files changed, 1573 insertions(+), 1421 deletions(-) rename .github/workflows/{test.yml => e2e-tests.yml} (99%) delete mode 100644 .github/workflows/full-sweep-1k1k-scheduler.yml delete mode 100644 .github/workflows/full-sweep-1k8k-scheduler.yml delete mode 100644 .github/workflows/full-sweep-8k1k-scheduler.yml delete mode 100644 .github/workflows/full-sweep-test.yml delete mode 100644 .github/workflows/full-sweep-tmpl.yml delete mode 100644 .github/workflows/runner-model-sweep-test.yml delete mode 100644 .github/workflows/runner-sweep-test.yml delete mode 100644 .github/workflows/runner-test.yml create mode 100644 .gitignore create mode 100644 utils/matrix-logic/pytest.ini create mode 100644 utils/matrix-logic/test_generate_sweep_configs.py diff --git a/.github/workflows/test.yml b/.github/workflows/e2e-tests.yml similarity index 99% rename from .github/workflows/test.yml rename to .github/workflows/e2e-tests.yml index 78b9b1f5e..ff7ecb92b 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,4 +1,4 @@ -name: Test Sweep +name: End-to-End Tests # concurrency: # group: benchmark-lock diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml deleted file mode 100644 index 601c760b3..000000000 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Full Sweep Scheduler - 1k1k - -concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: true - -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - -jobs: - mega-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: 
inherit - with: - run_1k1k: true - run_8k1k: false - run_1k8k: false - use_h100: true - use_h200: true - use_b200: true - use_mi300x: true - use_mi325x: true - use_mi355x: true - use_gb200: true - - calc-success-rate: - needs: mega-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml deleted file mode 100644 index 967935335..000000000 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Full Sweep Scheduler - 1k8k - -concurrency: - group: benchmark-lock-1k8k - cancel-in-progress: true - -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - -jobs: - mega-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: inherit - with: - run_1k1k: false - run_8k1k: false - run_1k8k: true - use_h100: true - use_h200: true - use_b200: true - use_mi300x: true - use_mi325x: true - use_mi355x: true - use_gb200: true - - calc-success-rate: - needs: mega-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - 
pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml deleted file mode 100644 index 791d9e017..000000000 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Full Sweep Scheduler - 8k1k - -concurrency: - group: benchmark-lock-8k1k - cancel-in-progress: true - -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - -jobs: - mega-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: inherit - with: - run_1k1k: false - run_8k1k: true - run_1k8k: false - use_h100: true - use_h200: true - use_b200: true - use_mi300x: true - use_mi325x: true - use_mi355x: true - use_gb200: true - - calc-success-rate: - needs: mega-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml deleted file mode 100644 index b134e407c..000000000 --- a/.github/workflows/full-sweep-test.yml +++ /dev/null @@ -1,89 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - 
workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - run_1k8k: - type: boolean - required: false - - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - mega-test-run: - uses: ./.github/workflows/full-sweep-tmpl.yml - secrets: inherit - with: - run_1k1k: ${{ inputs.run_1k1k }} - run_8k1k: ${{ inputs.run_8k1k }} - run_1k8k: ${{ inputs.run_1k8k }} - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - calc-success-rate: - needs: mega-test-run - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml deleted file mode 100644 index b086460df..000000000 --- a/.github/workflows/full-sweep-tmpl.yml +++ /dev/null @@ -1,263 +0,0 @@ -name: Template - Full Sweep - -on: - workflow_call: - inputs: - run_1k1k: - type: boolean - required: true - 
run_8k1k: - type: boolean - required: true - run_1k8k: - type: boolean - required: true - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - use_gb200: - type: boolean - required: false - default: false - -jobs: - _70b-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k1k-results: - needs: _70b-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - - dsr1-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k1k-results: - needs: dsr1-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - - gptoss-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - 
use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k1k-results: - needs: gptoss-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - - _70b-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-8k1k-results: - needs: _70b-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - - dsr1-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-8k1k-results: - needs: dsr1-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - - gptoss-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - 
collect-gptoss-8k1k-results: - needs: gptoss-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' - - _70b-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k8k-results: - needs: _70b-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - - dsr1-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k8k-results: - needs: dsr1-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - - gptoss-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k8k-results: - needs: gptoss-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: 
./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml deleted file mode 100644 index 212ffc07c..000000000 --- a/.github/workflows/runner-model-sweep-test.yml +++ /dev/null @@ -1,300 +0,0 @@ -name: 'Test - Runner Model Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'h200-trt' - - 'b200' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk-h100: - if: ${{ inputs.runner == 'h100' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200: - if: ${{ inputs.runner == 'h200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', 
framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200-trt: - if: ${{ inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - 
conc-list: '[1]' - - bmk-b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-nvd_2' - - 'b200-nvd_3' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[4]' - - bmk-b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nb_0' - - 'b200-nb_1' - config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'trt', precision: 'fp4', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 
'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - 'mi300x-oci_0' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - 
isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', 
framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml deleted file mode 100644 index fd100474f..000000000 --- a/.github/workflows/runner-sweep-test.yml +++ /dev/null @@ -1,333 +0,0 @@ -name: 'Test - Runner Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep - ${{ github.event.inputs.model }}' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'b200' - - 'h200-trt' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - - 'gb200' - - image: - description: 'Docker Image' - required: true - type: choice - options: - - 'lmsysorg/sglang:v0.4.9.post1-cu126' - - 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' - - 'lmsysorg/sglang:v0.5.2rc2-cu126' - - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - - 
'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - - 'vllm/vllm-openai:v0.10.2' - - model: - description: 'Model' - required: true - type: choice - options: - - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'amd/Llama-3.3-70B-Instruct-FP8-KV' - - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/Llama-3.3-70B-Instruct-FP8' - - 'nvidia/Llama-3.3-70B-Instruct-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4-v2' - - 'openai/gpt-oss-120b' - - framework: - description: 'Framework' - required: true - type: choice - options: - - 'vllm' - - 'sglang' - - 'trt' - - precision: - description: 'Precision' - required: true - type: choice - options: - - 'fp8' - - 'fp4' - - exp-name: - description: 'Experiment Name' - required: true - type: choice - options: - - '70b_test' - - 'dsr1_test' - - 'gptoss_test' - - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk_h100: - if: ${{ inputs.runner == 'h100' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_h200: - if: ${{ inputs.runner == 'h200' || inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 
'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[4]' - conc-list: '[64]' - - bmk_b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-tg_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: 
inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_gb200: - if: ${{ inputs.runner == 'gb200' && inputs.framework == 'trt' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - bmk_gb200-sgl: 
- if: ${{ inputs.runner == 'gb200' && inputs.framework == 'sglang' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'dynamo-sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: 8192 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - collect-test-results: - needs: [ bmk_h100, bmk_h200, bmk_b200, bmk_b200-trt, bmk_mi300x, bmk_mi325x, bmk_mi355x, bmk_gb200, bmk_gb200-sgl ] - if: ${{ always() && !cancelled() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml deleted file mode 100644 index 983394035..000000000 --- a/.github/workflows/runner-test.yml +++ /dev/null @@ -1,136 +0,0 @@ -name: Test - Runner -run-name: '${{ github.event.inputs.runner }} - ${{ github.event.inputs.model }}' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner' - required: true - type: choice - options: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nb_0' - - 'b200-nb_1' - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-nvd_2' - - 'b200-nvd_3' - - 'b200-tg_0' - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - 'mi300x-oci_0' - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - - image: - description: 'Docker Image' - required: true - type: choice - options: - - 'lmsysorg/sglang:v0.4.9.post1-cu126' - - 
'lmsysorg/sglang:v0.5.0rc1-cu128-b200' - - 'lmsysorg/sglang:v0.5.2rc2-cu126' - - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_vllm_0.10.1_instinct_rc1' - - 'rocm/7.0-preview:rocm7.0_preview_ubuntu_22.04_sgl-dev-v0.5.2rc2-mi30x_rc1' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - - 'vllm/vllm-openai:v0.10.2' - model: - description: 'Model' - required: true - type: choice - options: - - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'amd/Llama-3.3-70B-Instruct-FP8-KV' - - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/Llama-3.3-70B-Instruct-FP8' - - 'nvidia/Llama-3.3-70B-Instruct-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4-v2' - - 'openai/gpt-oss-120b' - - framework: - description: 'Framework' - required: true - type: choice - options: - - 'vllm' - - 'sglang' - - 'trt' - - precision: - description: 'Precision' - required: true - type: choice - options: - - 'fp8' - - 'fp4' - - exp-name: - description: 'Experiment Name' - required: true - type: choice - options: - - '70b_test' - - 'dsr1_test' - - 'gptoss_test' - -jobs: - runner-test: - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ inputs.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - 
random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[4]' - - collect-test-results: - needs: runner-test - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} diff --git a/.gitignore b/.gitignore new file mode 100644 index 000000000..03d36472a --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +**/__pycache__/** +**/.coverage \ No newline at end of file diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 6d092eac8..7574579af 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -142,90 +142,13 @@ def validate_master_configs_structure(all_config_data): def generate_full_sweep(args, all_config_data): - """Generate full sweep configurations based on model prefix and sequence lengths. + """Generate full sweep configurations with optional filtering. - Assumes all_config_data has been validated by validate_config_structure(). - """ - isl, osl = seq_len_stoi[args.seq_lens] - - matrix_values = [] - for key, val in all_config_data.items(): - # Filter by model prefix - if not key.startswith(args.model_prefix): - continue - - seq_len_configs = val['seq-len-configs'] - image = val['image'] - model = val['model'] - precision = val['precision'] - framework = val['framework'] - runner = val['runner'] - # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name - # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME - model_code = key.split('-')[0] - - # Check if this config has matching sequence lengths - matching_seq_config = None - for slq in seq_len_configs: - if slq['isl'] == isl and slq['osl'] == osl: - matching_seq_config = slq - break - - if not matching_seq_config: - continue # Skip this config if no matching sequence length - - bmk_space = matching_seq_config['search-space'] - - for bmk in bmk_space: - tp = bmk['tp'] - conc_start = bmk['conc-start'] - conc_end = bmk['conc-end'] - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - seq_len_str = seq_len_to_str(isl, osl) - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl + 200, - 'ep': 1, # Default - 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}", - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - return matrix_values - - -def generate_filtered_sweep(args, all_config_data): - """Generate sweep configurations with filtering options. - - Allows filtering by model prefix, precision, framework, runner type, and sequence lengths. + Supports filtering by model prefix, precision, framework, runner type, and sequence lengths. Supports test mode to only run highest TP with lowest concurrency. + All filters are optional - can generate sweeps for all configs or filter by specific criteria. + Assumes all_config_data has been validated by validate_config_structure(). 
""" # Validate runner types if specified @@ -754,86 +677,56 @@ def main(): 'full-sweep', parents=[parent_parser], add_help=False, - help='Generate full sweep configurations based on model prefix' - ) - full_sweep_parser.add_argument( - '--seq-lens', - choices=list(seq_len_stoi.keys()), - required=True, - help=f"Sequence length configuration: {', '.join(seq_len_stoi.keys())}" + help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' ) full_sweep_parser.add_argument( - '--model-prefix', - required=True, - help='Model prefix to filter configurations' - ) - full_sweep_parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - full_sweep_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: filtered-sweep - filtered_sweep_parser = subparsers.add_parser( - 'filtered-sweep', - parents=[parent_parser], - add_help=False, - help='Generate sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' - ) - filtered_sweep_parser.add_argument( '--model-prefix', nargs='+', required=False, help='Model prefix(es) to filter configurations (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--precision', nargs='+', required=False, help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--framework', nargs='+', required=False, help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--runner-type', nargs='+', required=False, help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' ) - filtered_sweep_parser.add_argument( + 
full_sweep_parser.add_argument( '--runner-config', required=False, help='Configuration file holding runner information (required if --runner-type is specified)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--seq-lens', nargs='+', choices=list(seq_len_stoi.keys()), required=False, help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--step-size', type=int, default=2, help='Step size for concurrency values (default: 2)' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '--test-mode', action='store_true', help='Test mode: only run highest TP with lowest concurrency for each matching config' ) - filtered_sweep_parser.add_argument( + full_sweep_parser.add_argument( '-h', '--help', action='help', help='Show this help message and exit' @@ -998,8 +891,6 @@ def main(): # Route to appropriate function based on subcommand if args.command == 'full-sweep': matrix_values = generate_full_sweep(args, all_config_data) - elif args.command == 'filtered-sweep': - matrix_values = generate_filtered_sweep(args, all_config_data) elif args.command == 'test-config': matrix_values = generate_test_config(args, all_config_data) elif args.command == 'runner-model-sweep': diff --git a/utils/matrix-logic/pytest.ini b/utils/matrix-logic/pytest.ini new file mode 100644 index 000000000..c3cd9aac7 --- /dev/null +++ b/utils/matrix-logic/pytest.ini @@ -0,0 +1,12 @@ +[pytest] +testpaths = . 
+python_files = test_*.py +python_classes = Test* +python_functions = test_* +addopts = + -v + --strict-markers + --tb=short +markers = + slow: marks tests as slow (deselect with '-m "not slow"') + integration: marks tests as integration tests diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py new file mode 100644 index 000000000..36cb14cd7 --- /dev/null +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -0,0 +1,1545 @@ +import pytest +import yaml +from unittest.mock import patch +from generate_sweep_configs import ( + validate_master_configs_structure, + validate_matrix_output, + seq_len_to_str, + generate_full_sweep, + generate_test_config, + generate_runner_model_sweep_config, + generate_runner_sweep_config, + generate_custom_test, + load_config_files, + main, + MatrixEntry, +) + + +# Fixtures for test config files +@pytest.fixture +def sample_master_config(): + """Sample master config with valid entries.""" + return { + "70b-fp8-vllm": { + "image": "vllm/vllm-openai:v0.10.2", + "model": "meta-llama/Llama-3-70b", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 4}, + {"tp": 8, "conc-start": 2, "conc-end": 8, "ep": 2, "dp-attn": True} + ] + }, + { + "isl": 1024, + "osl": 8192, + "search-space": [ + {"tp": 8, "conc-start": 1, "conc-end": 2} + ] + } + ] + }, + "8b-fp4-trt": { + "image": "nvcr.io/nvidia/tritonserver:24.01", + "model": "meta-llama/Llama-3-8b", + "precision": "fp4", + "framework": "trt", + "runner": "h100", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 2, "conc-start": 4, "conc-end": 16} + ] + } + ] + }, + "gptoss-120b-fp8-vllm": { + "image": "vllm/vllm-openai:latest", + "model": "openai/gpt-oss-120b", + "precision": "fp8", + "framework": "vllm", + "runner": "h200-trt", + "seq-len-configs": [ + { + "isl": 
1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 1, "conc-end": 4} + ] + } + ] + } + } + + +@pytest.fixture +def sample_runner_config(): + """Sample runner config.""" + return { + "h200": ["h200-nv_1", "h200-nv_2"], + "h100": ["h100-aws_1"], + "h200-trt": ["h200-trt_1", "h200-trt_2", "h200-trt_3"] + } + + +@pytest.fixture +def temp_config_files(tmp_path, sample_master_config, sample_runner_config): + """Create temporary config files.""" + master_file = tmp_path / "master.yaml" + runner_file = tmp_path / "runners.yaml" + + with open(master_file, 'w') as f: + yaml.dump(sample_master_config, f) + + with open(runner_file, 'w') as f: + yaml.dump(sample_runner_config, f) + + return str(master_file), str(runner_file) + + +@pytest.fixture +def invalid_master_config(): + """Master config with validation errors.""" + return { + "missing-field": { + "image": "test:latest", + "model": "test/model", + # Missing precision, framework, runner, seq-len-configs + } + } + + +# Tests for seq_len_to_str +def test_seq_len_to_str_with_mapping(): + """Test seq_len_to_str with known mappings.""" + assert seq_len_to_str(1024, 1024) == "1k1k" + assert seq_len_to_str(1024, 8192) == "1k8k" + assert seq_len_to_str(8192, 1024) == "8k1k" + + +def test_seq_len_to_str_without_mapping(): + """Test seq_len_to_str fallback for unknown mappings.""" + assert seq_len_to_str(2048, 4096) == "2048_4096" + assert seq_len_to_str(512, 512) == "512_512" + + +# Tests for MatrixEntry validation +def test_matrix_entry_valid(): + """Test valid MatrixEntry.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + } + result = MatrixEntry(**entry) + assert result.image == "test:latest" + assert result.tp == 8 + + +def test_matrix_entry_missing_field(): + """Test MatrixEntry with missing 
required field.""" + entry = { + "image": "test:latest", + "model": "test/model", + # Missing other required fields + } + with pytest.raises(Exception): # Pydantic ValidationError + MatrixEntry(**entry) + + +def test_matrix_entry_wrong_type(): + """Test MatrixEntry with wrong type.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": "not_an_int", # Wrong type + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + } + with pytest.raises(Exception): # Pydantic ValidationError + MatrixEntry(**entry) + + +def test_matrix_entry_extra_field(): + """Test MatrixEntry with extra field (should be forbidden).""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp", + "extra-field": "should_fail" + } + with pytest.raises(Exception): # Pydantic ValidationError + MatrixEntry(**entry) + + +# Tests for validate_matrix_output +def test_validate_matrix_output_valid(): + """Test validate_matrix_output with valid entries.""" + entries = [ + { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + } + ] + result = validate_matrix_output(entries) + assert result == entries + + +def test_validate_matrix_output_invalid(): + """Test validate_matrix_output with invalid entry.""" + entries = [ + { + "image": "test:latest", + "model": "test/model", + # Missing required fields + } + ] + with pytest.raises(ValueError, match="Matrix entry at index 0 failed validation"): + validate_matrix_output(entries) + + +def 
test_validate_matrix_output_multiple_entries(): + """Test validate_matrix_output with multiple entries.""" + entries = [ + { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2048, + "exp-name": "test_exp" + }, + { + "image": "test2:latest", + "model": "test2/model", + "precision": "fp4", + "framework": "trt", + "runner": "h100", + "isl": 1024, + "osl": 1024, + "tp": 4, + "ep": 2, + "dp-attn": True, + "conc": 8, + "max-model-len": 2048, + "exp-name": "test_exp2" + } + ] + result = validate_matrix_output(entries) + assert len(result) == 2 + + +# Tests for validate_master_configs_structure +def test_validate_master_configs_structure_valid(sample_master_config): + """Test validation of valid master config.""" + validate_master_configs_structure(sample_master_config) + + +def test_validate_master_configs_structure_missing_field(): + """Test validation with missing required field.""" + config = { + "test-key": { + "image": "test:latest", + # Missing other required fields + } + } + with pytest.raises(ValueError, match="Missing required field"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_type(): + """Test validation with wrong field type.""" + config = { + "test-key": { + "image": 123, # Should be string + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [] + } + } + with pytest.raises(ValueError, match="must be str"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_empty_seq_len_configs(): + """Test validation with empty seq-len-configs.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [] + } + } + with pytest.raises(ValueError, 
match="must be a non-empty list"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_invalid_search_space(): + """Test validation with invalid search-space.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8} # Missing conc-start and conc-end + ] + } + ] + } + } + with pytest.raises(ValueError, match="Missing 'conc-start'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_missing_search_space(): + """Test validation with missing search-space.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024 + # Missing search-space + } + ] + } + } + with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_search_space_not_list(): + """Test validation with search-space not being a list.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": "not_a_list" + } + ] + } + } + with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_extra_fields_in_search_space(): + """Test validation with extra fields in search-space.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "tp": 8, + "conc-start": 1, + "conc-end": 4, 
+ "invalid-field": "value" + } + ] + } + ] + } + } + with pytest.raises(ValueError, match="Extra fields"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_missing_isl(): + """Test validation with missing isl.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="Missing 'isl'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_isl_type(): + """Test validation with wrong isl type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": "not_int", + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="'isl' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_missing_osl(): + """Test validation with missing osl.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="Missing 'osl'"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_osl_type(): + """Test validation with wrong osl type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": "not_int", + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with 
pytest.raises(ValueError, match="'osl' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_tp_type(): + """Test validation with wrong tp type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": "not_int", "conc-start": 1, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="'tp' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_conc_start_type(): + """Test validation with wrong conc-start type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": "not_int", "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="'conc-start' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_conc_end_type(): + """Test validation with wrong conc-end type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": "not_int"}] + } + ] + } + } + with pytest.raises(ValueError, match="'conc-end' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_ep_type(): + """Test validation with wrong ep type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, 
"conc-end": 4, "ep": "not_int"}] + } + ] + } + } + with pytest.raises(ValueError, match="'ep' must be int"): + validate_master_configs_structure(config) + + +def test_validate_master_configs_structure_wrong_dp_attn_type(): + """Test validation with wrong dp-attn type.""" + config = { + "test-key": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "dp-attn": "not_bool"}] + } + ] + } + } + with pytest.raises(ValueError, match="'dp-attn' must be bool"): + validate_master_configs_structure(config) + + +# Tests for load_config_files +def test_load_config_files_valid(temp_config_files): + """Test loading valid config files.""" + master_file, _ = temp_config_files + result = load_config_files([master_file]) + assert len(result) == 3 + assert "70b-fp8-vllm" in result + + +def test_load_config_files_multiple(tmp_path, sample_master_config): + """Test loading multiple config files.""" + file1 = tmp_path / "config1.yaml" + file2 = tmp_path / "config2.yaml" + + config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} + config2 = {"8b-fp4-trt": sample_master_config["8b-fp4-trt"]} + + with open(file1, 'w') as f: + yaml.dump(config1, f) + with open(file2, 'w') as f: + yaml.dump(config2, f) + + result = load_config_files([str(file1), str(file2)]) + assert len(result) == 2 + + +def test_load_config_files_not_found(): + """Test loading non-existent config file.""" + with pytest.raises(ValueError, match="does not exist"): + load_config_files(["/nonexistent/file.yaml"]) + + +def test_load_config_files_duplicate_keys(tmp_path, sample_master_config): + """Test loading files with duplicate keys.""" + file1 = tmp_path / "config1.yaml" + file2 = tmp_path / "config2.yaml" + + config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} + config2 = {"70b-fp8-vllm": 
sample_master_config["70b-fp8-vllm"]} # Duplicate + + with open(file1, 'w') as f: + yaml.dump(config1, f) + with open(file2, 'w') as f: + yaml.dump(config2, f) + + with pytest.raises(ValueError, match="Duplicate configuration keys"): + load_config_files([str(file1), str(file2)]) + + +# Tests for generate_full_sweep +def test_generate_full_sweep_basic(sample_master_config, temp_config_files): + """Test basic full sweep generation.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + assert all(entry['exp-name'].startswith('70b_1k1k') for entry in result) + assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) + + +def test_generate_full_sweep_with_optionals(sample_master_config, temp_config_files): + """Test full sweep with optional ep and dp-attn.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # Find entry with tp=8 which should have ep=2 and dp-attn=True + tp8_entries = [e for e in result if e['tp'] == 8] + assert len(tp8_entries) > 0 + assert all(e['ep'] == 2 for e in tp8_entries) + assert all(e['dp-attn'] == True for e in tp8_entries) + + +def test_generate_full_sweep_no_matches(sample_master_config, temp_config_files): + """Test full sweep with no matching configs.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["nonexistent"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + with pytest.raises(ValueError, 
match="No configs found matching filters"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_different_seq_len(sample_master_config, temp_config_files): + """Test full sweep with different sequence length.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k8k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result) + + +def test_generate_full_sweep_step_size(sample_master_config, temp_config_files): + """Test full sweep with different step size.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["8b"] + seq_lens = ["1k1k"] + step_size = 4 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # Should have entries at conc=4, 8, 16 (step_size=4, conc-start=4, conc-end=16) + conc_values = sorted(set(e['conc'] for e in result)) + assert 4 in conc_values + assert 16 in conc_values + + +def test_generate_full_sweep_seq_len_not_in_config(temp_config_files): + """Test full sweep when requested seq-len is not in config.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 8192, + "osl": 1024, # Only has 8k1k, not 1k1k + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 4} + ] + } + ] + } + } + + class Args: + model_prefix = ["test"] + seq_lens = ["1k1k"] # Requesting 1k1k but config only has 8k1k + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + # Should raise 
error since no matching seq-len + with pytest.raises(ValueError, match="No configs found matching filters"): + generate_full_sweep(Args(), config) + + +def test_generate_full_sweep_concurrency_overshoot(temp_config_files): + """Test full sweep when concurrency step overshoots end value.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 5} # 1, 3*2=6 overshoots, clamps to 5 + ] + } + ] + } + } + + class Args: + model_prefix = ["test"] + seq_lens = ["1k1k"] + step_size = 3 # Will overshoot: 1, 3, 9 (clamped to 5) + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), config) + conc_values = sorted(set(e['conc'] for e in result)) + # Should have 1, 3, 5 (5 is the clamped value) + assert conc_values == [1, 3, 5] + + +# Tests for generate_full_sweep with filters +def test_generate_full_sweep_no_filters(sample_master_config, temp_config_files): + """Test filtered sweep with no filters.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = None + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + + +def test_generate_full_sweep_with_filters_model_prefix(sample_master_config, temp_config_files): + """Test filtered sweep with model prefix filter.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = None + framework = None + runner_type = None + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + 
assert all("70b" in entry['exp-name'] for entry in result) + + +def test_generate_full_sweep_with_filters_multiple_filters(sample_master_config, temp_config_files): + """Test filtered sweep with multiple filters.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = ["fp8"] + framework = ["vllm"] + runner_type = None + seq_lens = ["1k1k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + assert len(result) > 0 + assert all(entry['precision'] == 'fp8' for entry in result) + assert all(entry['framework'] == 'vllm' for entry in result) + + +def test_generate_full_sweep_with_filters_test_mode(sample_master_config, temp_config_files): + """Test filtered sweep in test mode.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = None + framework = None + runner_type = None + seq_lens = ["1k1k"] + step_size = 2 + test_mode = True + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # In test mode, should only get one entry per seq-len (highest TP, lowest conc) + assert len(result) == 1 # Only one config matches 70b with 1k1k + assert result[0]['tp'] == 8 # Highest TP + assert '70b_1k1k' in result[0]['exp-name'] + + +def test_generate_full_sweep_with_filters_runner_type_validation(sample_master_config, temp_config_files): + """Test filtered sweep with invalid runner type.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = ["invalid-runner"] + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + with pytest.raises(ValueError, match="Invalid runner type"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_with_filters_runner_type_no_config(sample_master_config): + """Test filtered sweep with runner type but no config file.""" + 
class Args: + model_prefix = None + precision = None + framework = None + runner_type = ["h200"] + seq_lens = None + step_size = 2 + test_mode = False + runner_config = None + + with pytest.raises(ValueError, match="runner-config is required"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_with_filters_multiple_runner_types(sample_master_config, temp_config_files): + """Test filtered sweep with multiple runner types.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = ["h200", "h100"] + seq_lens = ["1k1k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + runners = set(entry['runner'] for entry in result) + assert 'h200' in runners or 'h100' in runners + + +def test_generate_full_sweep_with_filters_no_matches(sample_master_config, temp_config_files): + """Test filtered sweep with no matching configs.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["nonexistent"] + precision = None + framework = None + runner_type = None + seq_lens = None + step_size = 2 + test_mode = False + runner_config = runner_file + + with pytest.raises(ValueError, match="No configs found matching filters"): + generate_full_sweep(Args(), sample_master_config) + + +def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_files): + """Test filtered sweep when concurrency step overshoots end value.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 2, "conc-end": 7} # 2, 8 overshoots, clamps to 7 + ] + } + ] + } + } + + class Args: + model_prefix = None + precision = None + framework = None + runner_type = None + 
seq_lens = None + step_size = 4 # Will overshoot: 2, 8 (clamped to 7) + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), config) + conc_values = sorted(set(e['conc'] for e in result)) + # Should have 2, 7 (7 is the clamped value) + assert 2 in conc_values + assert 7 in conc_values + + +# Tests for generate_test_config +def test_generate_test_config_basic(sample_master_config, temp_config_files): + """Test basic test config generation.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = None + step_size = 2 + test_mode = False + + result = generate_test_config(Args(), sample_master_config) + assert len(result) > 0 + + +def test_generate_test_config_test_mode(sample_master_config, temp_config_files): + """Test test config in test mode.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = ["1k1k"] + step_size = 2 + test_mode = True + + result = generate_test_config(Args(), sample_master_config) + # In test mode, should only use lowest concurrency + assert all(entry['conc'] == 1 or entry['conc'] == 2 for entry in result) + + +def test_generate_test_config_specific_runner_node(sample_master_config, temp_config_files): + """Test test config with specific runner node.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = None + step_size = 2 + test_mode = False + + result = generate_test_config(Args(), sample_master_config) + assert all(entry['runner'] == 'h200-nv_1' for entry in result) + + +def test_generate_test_config_invalid_key(sample_master_config, temp_config_files): + """Test test config with invalid key.""" + _, runner_file = temp_config_files + + class Args: + key = "nonexistent-key" + runner_config = runner_file + runner_node = None + 
seq_lens = None + step_size = 2 + test_mode = False + + with pytest.raises(ValueError, match="does not exist in config files"): + generate_test_config(Args(), sample_master_config) + + +def test_generate_test_config_invalid_runner_node(sample_master_config, temp_config_files): + """Test test config with invalid runner node.""" + _, runner_file = temp_config_files + + class Args: + key = "70b-fp8-vllm" + runner_config = runner_file + runner_node = "invalid-node" + seq_lens = None + step_size = 2 + test_mode = False + + with pytest.raises(ValueError, match="is not compatible"): + generate_test_config(Args(), sample_master_config) + + +def test_generate_test_config_missing_runner_config(sample_master_config): + """Test test config with missing runner config file.""" + class Args: + key = "70b-fp8-vllm" + runner_config = "/nonexistent/file.yaml" + runner_node = None + seq_lens = None + step_size = 2 + test_mode = False + + with pytest.raises(ValueError, match="does not exist"): + generate_test_config(Args(), sample_master_config) + + +def test_generate_test_config_concurrency_overshoot(temp_config_files): + """Test test config when concurrency step overshoots end value.""" + _, runner_file = temp_config_files + + config = { + "test-fp8-vllm": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 6} + ] + } + ] + } + } + + class Args: + key = "test-fp8-vllm" + runner_config = runner_file + runner_node = "h200-nv_1" + seq_lens = None + step_size = 4 # Will overshoot: 1, 4, 16 (clamped to 6) + test_mode = False + + result = generate_test_config(Args(), config) + conc_values = sorted(set(e['conc'] for e in result)) + assert 1 in conc_values + assert 4 in conc_values + assert 6 in conc_values + + +# Tests for generate_runner_model_sweep_config +def 
test_generate_runner_model_sweep_config(sample_master_config, temp_config_files): + """Test runner-model sweep config generation.""" + _, runner_file = temp_config_files + + class Args: + runner_type = "h200" + runner_config = runner_file + + result = generate_runner_model_sweep_config(Args(), sample_master_config) + assert len(result) > 0 + # Should have entries for each runner node under h200 + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' in runners + + +def test_generate_runner_model_sweep_config_invalid_runner(sample_master_config, temp_config_files): + """Test runner-model sweep with invalid runner type.""" + _, runner_file = temp_config_files + + class Args: + runner_type = "invalid-runner" + runner_config = runner_file + + with pytest.raises(ValueError, match="does not exist in runner config"): + generate_runner_model_sweep_config(Args(), sample_master_config) + + +# Tests for generate_runner_sweep_config +def test_generate_runner_sweep_config(sample_master_config, temp_config_files): + """Test runner sweep config generation.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = "70b" + precision = None + framework = None + runner_config = runner_file + + result = generate_runner_sweep_config(Args(), sample_master_config) + assert len(result) > 0 + + +def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_config_files): + """Test runner sweep with precision and framework filters.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = "70b" + precision = "fp8" + framework = "vllm" + runner_config = runner_file + + result = generate_runner_sweep_config(Args(), sample_master_config) + assert all(entry['precision'] == 'fp8' for entry in result) + assert all(entry['framework'] == 'vllm' for entry in result) + + +def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_config_files): + """Test runner sweep with no matching 
configs.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = "nonexistent" + precision = None + framework = None + runner_config = runner_file + + with pytest.raises(ValueError, match="No configs found matching"): + generate_runner_sweep_config(Args(), sample_master_config) + + +# Tests for generate_custom_test +def test_generate_custom_test(temp_config_files): + """Test custom test generation.""" + _, runner_file = temp_config_files + + class Args: + runner_label = "h200" + image = "vllm/vllm-openai:latest" + model = "test/model" + framework = "vllm" + precision = "fp8" + exp_name = "custom_test" + runner_config = runner_file + + result = generate_custom_test(Args()) + assert len(result) == 1 + assert result[0]['image'] == "vllm/vllm-openai:latest" + assert result[0]['exp-name'] == "custom_test" + + +def test_generate_custom_test_invalid_runner(temp_config_files): + """Test custom test with invalid runner label.""" + _, runner_file = temp_config_files + + class Args: + runner_label = "invalid-runner" + image = "vllm/vllm-openai:latest" + model = "test/model" + framework = "vllm" + precision = "fp8" + exp_name = "custom_test" + runner_config = runner_file + + with pytest.raises(ValueError, match="Unable to find specified runner label"): + generate_custom_test(Args()) + + +# Tests for main function +def test_main_full_sweep(temp_config_files): + """Test main function with full-sweep command.""" + master_file, _ = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--seq-lens", "1k1k", + "--model-prefix", "70b", + "--step-size", "2" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_full_sweep_with_filters(temp_config_files): + """Test main function with full-sweep command with filters.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", 
master_file, + "--runner-config", runner_file, + "--model-prefix", "70b", + "--precision", "fp8", + "--test-mode" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_test_config(temp_config_files): + """Test main function with test-config command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "test-config", + "--config-files", master_file, + "--runner-config", runner_file, + "--key", "70b-fp8-vllm", + "--runner-node", "h200-nv_1", + "--test-mode" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_runner_model_sweep(temp_config_files): + """Test main function with runner-model-sweep command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "runner-model-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--runner-type", "h200" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_runner_sweep(temp_config_files): + """Test main function with runner-sweep command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "runner-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--model-prefix", "70b" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + +def test_main_custom(temp_config_files): + """Test main function with custom command.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "custom", + "--config-files", master_file, + "--runner-config", runner_file, + "--runner-label", "h200", + "--image", "test:latest", + "--model", "test/model", + "--framework", "vllm", + "--precision", "fp8", + "--exp-name", "custom_test" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) == 1 + + +def 
test_main_invalid_config_structure(tmp_path): + """Test main with invalid config structure.""" + invalid_file = tmp_path / "invalid.yaml" + with open(invalid_file, 'w') as f: + yaml.dump({"key": {"image": "test"}}, f) # Missing required fields + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", str(invalid_file), + "--seq-lens", "1k1k", + "--model-prefix", "test" + ] + + with patch('sys.argv', test_args): + with pytest.raises(ValueError): + main() + + +def test_main_validation_failure(temp_config_files, monkeypatch): + """Test main with validation failure on output.""" + master_file, _ = temp_config_files + + # Monkey patch validate_matrix_output to always fail + def mock_validate(entries): + raise ValueError("Validation failed") + + monkeypatch.setattr('generate_sweep_configs.validate_matrix_output', mock_validate) + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--seq-lens", "1k1k", + "--model-prefix", "70b" + ] + + with patch('sys.argv', test_args): + with pytest.raises(ValueError, match="Validation failed"): + main() + + +# Edge case tests +def test_concurrency_step_reaches_exact_end(sample_master_config, temp_config_files): + """Test that concurrency stepping reaches exact end value.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["8b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # conc-start=4, conc-end=16, step=2 should give 4,8,16 + conc_values = sorted(set(e['conc'] for e in result)) + assert 16 in conc_values + + +def test_multiple_model_prefixes_filtered_sweep(sample_master_config, temp_config_files): + """Test filtered sweep with multiple model prefixes.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b", "8b"] + precision = None + framework = None + 
runner_type = None + seq_lens = ["1k1k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + exp_names = [e['exp-name'] for e in result] + assert any('70b' in name for name in exp_names) + assert any('8b' in name for name in exp_names) + + +def test_seq_len_filter_multiple(sample_master_config, temp_config_files): + """Test filtering with multiple sequence lengths.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + precision = None + framework = None + runner_type = None + seq_lens = ["1k1k", "1k8k"] + step_size = 2 + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + seq_lens = set((e['isl'], e['osl']) for e in result) + assert (1024, 1024) in seq_lens + assert (1024, 8192) in seq_lens + + +def test_default_ep_dp_attn_values(sample_master_config, temp_config_files): + """Test that default ep and dp-attn values are set correctly.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["8b"] + seq_lens = ["1k1k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # 8b config doesn't specify ep/dp-attn, so should use defaults + assert all(e['ep'] == 1 for e in result) + assert all(e['dp-attn'] == False for e in result) + + +def test_max_model_len_calculation(sample_master_config, temp_config_files): + """Test that max-model-len is calculated correctly.""" + _, runner_file = temp_config_files + + class Args: + model_prefix = ["70b"] + seq_lens = ["1k8k"] + step_size = 2 + precision = None + framework = None + runner_type = None + test_mode = False + runner_config = runner_file + + result = generate_full_sweep(Args(), sample_master_config) + # isl=1024, osl=8192, so max-model-len should be 1024+8192+200=9416 + assert all(e['max-model-len'] == 
9416 for e in result) + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "--cov=generate_sweep_configs", "--cov-report=term-missing"]) From 5729c677cf3f55b3cf1dc536b04c57c41d4721ed Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 09:08:45 -0500 Subject: [PATCH 070/149] list tp ep dpa then conc --- utils/summarize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/summarize.py b/utils/summarize.py index de8863c78..6d926255e 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -12,7 +12,7 @@ results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) summary_header = f'''\ -| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| Hardware | Framework | Precision | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) @@ -26,8 +26,8 @@ f"| {precision.upper()} " f"| {result['tp']} " f"| {result['ep']} " - f"| {result['conc']} " f"| {result['dp_attention']} " + f"| {result['conc']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " f"| {result['median_e2el']:.4f} " From 6edcc3ac6d0b0755a45f60a6236d134976a33f05 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 09:15:01 -0500 Subject: [PATCH 071/149] removing 70b stuff --- .github/configs/amd-master.yaml | 116 --------------- .github/configs/nvidia-master.yaml | 199 -------------------------- .github/workflows/1k1k-sweep.yml | 46 ------ .github/workflows/1k8k-sweep.yml | 45 ------ .github/workflows/8k1k-sweep.yml | 45 ------ benchmarks/70b_fp4_b200_docker.sh | 48 ------- benchmarks/70b_fp4_b200_trt_docker.sh | 46 ------ benchmarks/70b_fp4_b200_trt_slurm.sh | 81 ----------- benchmarks/70b_fp4_mi355x_docker.sh | 55 ------- 
benchmarks/70b_fp4_mi355x_slurm.sh | 84 ----------- benchmarks/70b_fp8_b200_docker.sh | 46 ------ benchmarks/70b_fp8_b200_trt_docker.sh | 46 ------ benchmarks/70b_fp8_b200_trt_slurm.sh | 81 ----------- benchmarks/70b_fp8_h100_docker.sh | 29 ---- benchmarks/70b_fp8_h100_slurm.sh | 60 -------- benchmarks/70b_fp8_h200_slurm.sh | 76 ---------- benchmarks/70b_fp8_h200_trt_slurm.sh | 76 ---------- benchmarks/70b_fp8_mi300x_docker.sh | 59 -------- benchmarks/70b_fp8_mi300x_slurm.sh | 92 ------------ benchmarks/70b_fp8_mi325x_docker.sh | 53 ------- benchmarks/70b_fp8_mi325x_slurm.sh | 92 ------------ benchmarks/70b_fp8_mi355x_docker.sh | 50 ------- benchmarks/70b_fp8_mi355x_slurm.sh | 75 ---------- 23 files changed, 1600 deletions(-) delete mode 100644 benchmarks/70b_fp4_b200_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp4_mi355x_docker.sh delete mode 100644 benchmarks/70b_fp4_mi355x_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100755 benchmarks/70b_fp8_h100_docker.sh delete mode 100644 benchmarks/70b_fp8_h100_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi300x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi300x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi355x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi355x_slurm.sh diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 81c436366..55086d443 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1,119 +1,3 @@ -70b-fp4-mi355x-vllm: - image: 
rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-MXFP4-Preview - runner: mi355x - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi300x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi325x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, 
conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 32, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 64, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 32 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 - model: amd/Llama-3.3-70B-Instruct-FP8-KV - runner: mi355x - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - dsr1-fp4-mi355x-sgl: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 model: amd/DeepSeek-R1-0528-MXFP4-Preview diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index fe9ef989d..9da1cd0f9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,202 +1,3 @@ -70b-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200-trt - precision: fp4 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - 
search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp4-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP4 - runner: b200 - precision: fp4 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, conc-start: 4, conc-end: 8 } - -70b-fp8-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200-trt - precision: fp8 - framework: trt - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { 
tp: 4, conc-start: 16, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 128 } - - { tp: 2, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } - -70b-fp8-b200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: b200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 16, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h100-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h100 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - -70b-fp8-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200-trt - precision: fp8 - framework: trt - seq-len-configs: - 
- isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 128, conc-end: 128 } - - { tp: 2, conc-start: 64, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 32 } - -70b-fp8-h200-vllm: - image: vllm/vllm-openai:v0.10.2 - model: nvidia/Llama-3.3-70B-Instruct-FP8 - runner: h200 - precision: fp8 - framework: vllm - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 32, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } - - { tp: 2, conc-start: 64, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 16, conc-end: 64 } - - { tp: 2, conc-start: 16, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - dsr1-fp4-b200-sgl: image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 model: nvidia/DeepSeek-R1-0528-FP4-V2 diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 958fd73b9..cbdc490e2 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -11,20 +11,6 @@ on: # - cron: '0 23 * * *' jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-70b-configs - 
run: | - pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: runs-on: ubuntu-latest outputs: @@ -53,30 +39,6 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-dsr1: needs: get-dsr1-configs uses: ./.github/workflows/benchmark-tmpl.yml @@ -125,14 +87,6 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "70b_1k1k" - collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 68fbac028..25fc3a362 100644 --- a/.github/workflows/1k8k-sweep.yml +++ 
b/.github/workflows/1k8k-sweep.yml @@ -11,19 +11,6 @@ on: # - cron: '0 23 * * *' jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-70b-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: runs-on: ubuntu-latest outputs: @@ -50,30 +37,6 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 1k8k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_1k8k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-dsr1: needs: get-dsr1-configs uses: ./.github/workflows/benchmark-tmpl.yml @@ -122,14 +85,6 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 
"70b_1k8k" - collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 7be91c4fb..c8338d533 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -11,19 +11,6 @@ on: # - cron: '0 23 * * *' jobs: - get-70b-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-70b-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-70b-configs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix 70b) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - get-dsr1-configs: runs-on: ubuntu-latest outputs: @@ -50,30 +37,6 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - benchmark-70b: - needs: get-70b-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: 70b 8k1k - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-70b-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "70b_8k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-dsr1: needs: get-dsr1-configs uses: 
./.github/workflows/benchmark-tmpl.yml @@ -122,14 +85,6 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - collect-70b-results: - needs: benchmark-70b - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "70b_8k1k" - collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/benchmarks/70b_fp4_b200_docker.sh b/benchmarks/70b_fp4_b200_docker.sh deleted file mode 100644 index a76ffb9f8..000000000 --- a/benchmarks/70b_fp4_b200_docker.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -export TORCH_CUDA_ARCH_LIST="10.0" -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ 
---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests diff --git a/benchmarks/70b_fp4_b200_trt_docker.sh b/benchmarks/70b_fp4_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp4_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index a480ca910..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + 
$PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh deleted file mode 100644 index 681a629fb..000000000 --- 
a/benchmarks/70b_fp4_mi355x_docker.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ ---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh deleted file mode 100644 index 0d5a469d0..000000000 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export 
VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ ---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - diff --git a/benchmarks/70b_fp8_b200_docker.sh b/benchmarks/70b_fp8_b200_docker.sh deleted file mode 100644 index dbcfaf6fd..000000000 --- a/benchmarks/70b_fp8_b200_docker.sh +++ 
/dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - - - -FUSION_FLAG='{'\ -'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\ -'"custom_ops": ["+quant_fp8", "+rms_norm"],'\ -'"cudagraph_mode": "FULL_DECODE_ONLY",'\ -'"splitting_ops": []'\ -'}' -cat > config.yaml <<-EOF -kv-cache-dtype: fp8 -compilation-config: '$FUSION_FLAG' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $MAX_MODEL_LEN -EOF - -cat config.yaml # Debugging - -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=512 \ ---config config.yaml \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_b200_trt_docker.sh b/benchmarks/70b_fp8_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp8_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - 
enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index a480ca910..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN 
--max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h100_docker.sh b/benchmarks/70b_fp8_h100_docker.sh deleted file mode 100755 index 5d8df1bac..000000000 --- a/benchmarks/70b_fp8_h100_docker.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# MAX_MODEL_LEN -# TP -# CONC - -pip install -q datasets pandas - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -export PYTHONNOUSERSITE=1 - -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_h100_slurm.sh b/benchmarks/70b_fp8_h100_slurm.sh deleted file mode 100644 index 485aa8817..000000000 --- a/benchmarks/70b_fp8_h100_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL 
-# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -export TORCH_CUDA_ARCH_LIST="9.0" - -set -x -PYTHONNOUSERSITE=1 vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 86eefd8ce..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - 
CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - # Ignore intel_extension_for_pytorch import errors - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index 28112196f..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,76 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE 
-# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ 
---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh deleted file mode 100644 index 941e95023..000000000 --- a/benchmarks/70b_fp8_mi300x_docker.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. -# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh deleted file mode 100644 index b387505f0..000000000 --- a/benchmarks/70b_fp8_mi300x_slurm.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. 
-# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ 
---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh deleted file mode 100644 index 9e1fcdf8b..000000000 --- a/benchmarks/70b_fp8_mi325x_docker.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 105ba7185..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. 
-file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh deleted file mode 100644 index 6310a5f64..000000000 --- a/benchmarks/70b_fp8_mi355x_docker.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC 
-# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -sleep 5 -cat config.yaml - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests - diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh deleted file mode 100644 index 2abfee137..000000000 --- a/benchmarks/70b_fp8_mi355x_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export 
VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - -exit From 69844b2eadc804093fab94a1359aee8c21aaf4b5 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:37:58 -0500 Subject: [PATCH 072/149] temp fix (#148) --- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 +++++++++++++++++++++ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 +++++++++++++++++++++ benchmarks/70b_fp8_h200_slurm.sh | 69 ++++++++++++++++++++ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ++++++++++++++++++++ benchmarks/70b_fp8_mi325x_slurm.sh | 86 +++++++++++++++++++++++++ benchmarks/dsr1_fp4_b200_trt_slurm.sh | 6 -- 
benchmarks/dsr1_fp8_b200_trt_slurm.sh | 6 -- benchmarks/dsr1_fp8_h200_slurm.sh | 6 -- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 6 -- benchmarks/dsr1_fp8_mi325x_slurm.sh | 6 -- benchmarks/gptoss_fp4_h200_slurm.sh | 7 -- benchmarks/gptoss_fp4_h200_trt_slurm.sh | 6 -- 12 files changed, 375 insertions(+), 43 deletions(-) create mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len 
$MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 
+kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh new file mode 100644 index 000000000..094fbd19c --- /dev/null +++ b/benchmarks/70b_fp8_h200_slurm.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +pip install datasets pandas + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +# Create config.yaml +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true 
+max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +export TORCH_CUDA_ARCH_LIST="9.0" + +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ + --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh new file mode 100644 index 000000000..dfb2324b9 --- /dev/null +++ b/benchmarks/70b_fp8_h200_trt_slurm.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 
+batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh new file mode 100644 index 000000000..1febeff13 --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -0,0 +1,86 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Reference +# 
https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
+set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index ababfa150..6f4f814a0 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -104,12 +104,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 509cca7ba..58d4525f1 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -74,12 +74,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root \ 
set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_h200_slurm.sh b/benchmarks/dsr1_fp8_h200_slurm.sh index 765cf7dcd..74a005a78 100644 --- a/benchmarks/dsr1_fp8_h200_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_slurm.sh @@ -47,12 +47,6 @@ fi set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 174d67b53..7b566c0ab 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -74,12 +74,6 @@ PYTHONNOUSERSITE=1 mpirun -n 1 --oversubscribe --allow-run-as-root \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh index acbe78d08..d502093d8 100644 --- a/benchmarks/dsr1_fp8_mi325x_slurm.sh +++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh @@ -26,12 +26,6 @@ python3 -m sglang.launch_server \ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 "$SERVER_LOG" - echo "JOB $SLURM_JOB_ID ran on $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then break fi diff --git a/benchmarks/gptoss_fp4_h200_slurm.sh b/benchmarks/gptoss_fp4_h200_slurm.sh index 
61bef8aaa..23ac0bfa1 100644 --- a/benchmarks/gptoss_fp4_h200_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_slurm.sh @@ -50,13 +50,6 @@ PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config set +x while IFS= read -r line; do printf '%s\n' "$line" - # Ignore intel_extension_for_pytorch import errors - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]] && [[ ! "$line" =~ "intel_extension_for_pytorch" ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi diff --git a/benchmarks/gptoss_fp4_h200_trt_slurm.sh b/benchmarks/gptoss_fp4_h200_trt_slurm.sh index 969d65310..c148a3cb7 100644 --- a/benchmarks/gptoss_fp4_h200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_h200_trt_slurm.sh @@ -51,12 +51,6 @@ mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --max_batch_ set +x while IFS= read -r line; do printf '%s\n' "$line" - if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then - sleep 5 - tail -n100 $SERVER_LOG - echo "JOB $SLURM_JOB_ID ran on NODE $SLURMD_NODENAME" - exit 1 - fi if [[ "$line" == *"Application startup complete"* ]]; then break fi From 1105aea6ae2ed4ff700505ee73bf49678b7ab7d1 Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Oct 2025 22:01:12 -0400 Subject: [PATCH 073/149] remove: llama 70b --- .github/workflows/full-sweep-tmpl.yml | 188 ++++++++++ .github/workflows/runner-model-sweep-test.yml | 289 +++++++++++++++ .github/workflows/runner-sweep-test.yml | 328 ++++++++++++++++++ benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ---- benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ---- benchmarks/70b_fp8_h200_slurm.sh | 69 ---- benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ---- benchmarks/70b_fp8_mi325x_slurm.sh | 86 ----- 8 files changed, 805 insertions(+), 375 deletions(-) create mode 100644 .github/workflows/full-sweep-tmpl.yml create mode 100644 
.github/workflows/runner-model-sweep-test.yml create mode 100644 .github/workflows/runner-sweep-test.yml delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml new file mode 100644 index 000000000..869928cb7 --- /dev/null +++ b/.github/workflows/full-sweep-tmpl.yml @@ -0,0 +1,188 @@ +name: Template - Full Sweep + +on: + workflow_call: + inputs: + run_1k1k: + type: boolean + required: true + run_8k1k: + type: boolean + required: true + run_1k8k: + type: boolean + required: true + + use_h100: + type: boolean + required: true + use_h200: + type: boolean + required: true + use_b200: + type: boolean + required: true + use_mi300x: + type: boolean + required: true + use_mi325x: + type: boolean + required: true + use_mi355x: + type: boolean + required: true + use_gb200: + type: boolean + required: false + default: false + +jobs: + dsr1-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + use_gb200: ${{ inputs.use_gb200 }} + + collect-dsr1-1k1k-results: + needs: dsr1-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_1k1k' + + gptoss-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h100: ${{ 
inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-gptoss-1k1k-results: + needs: gptoss-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' + + dsr1-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + use_gb200: ${{ inputs.use_gb200 }} + + collect-dsr1-8k1k-results: + needs: dsr1-8k1k + if: ${{ inputs.run_8k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_8k1k' + + gptoss-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-gptoss-8k1k-results: + needs: gptoss-8k1k + if: ${{ inputs.run_8k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_8k1k' + + dsr1-1k8k: + if: ${{ inputs.run_1k8k }} + uses: ./.github/workflows/dsr1-tmpl.yml + secrets: inherit + with: + exp-name: 'dsr1_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ 
inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + use_gb200: ${{ inputs.use_gb200 }} + + collect-dsr1-1k8k-results: + needs: dsr1-1k8k + if: ${{ inputs.run_1k8k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'dsr1_1k8k' + + gptoss-1k8k: + if: ${{ inputs.run_1k8k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-gptoss-1k8k-results: + needs: gptoss-1k8k + if: ${{ inputs.run_1k8k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_1k8k' diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml new file mode 100644 index 000000000..e4f2b7303 --- /dev/null +++ b/.github/workflows/runner-model-sweep-test.yml @@ -0,0 +1,289 @@ +name: 'Test - Runner Model Sweep' +run-name: '${{ github.event.inputs.runner }} Sweep' +on: + workflow_dispatch: + inputs: + runner: + description: 'Runner Type' + required: true + type: choice + options: + - 'h100' + - 'h200' + - 'h200-trt' + - 'b200' + - 'b200-trt' + - 'mi300x' + - 'mi325x' + - 'mi355x' + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_CACHE: '/mnt/hf_hub_cache/' + +jobs: + bmk-h100: + if: ${{ inputs.runner == 'h100' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h100-cr_0' + - 'h100-cr_1' + - 'h100-cw_0' + - 'h100-cw_1' + config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner 
}} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-h200: + if: ${{ inputs.runner == 'h200' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h200-cw_0' + - 'h200-cw_1' + - 'h200-nb_0' + - 'h200-nb_1' + - 'h200-nb_2' + - 'h200-nb_3' + - 'h200-nv_0' + - 'h200-nv_1' + - 'h200-nv_2' + - 'h200-nv_3' + config: + - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-h200-trt: + if: ${{ inputs.runner == 'h200-trt' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h200-cw_0' + - 'h200-cw_1' + - 'h200-nb_0' + - 'h200-nb_1' + - 'h200-nb_2' + - 'h200-nb_3' + - 'h200-nv_0' + - 'h200-nv_1' + - 'h200-nv_2' + - 'h200-nv_3' + config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: 
./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-b200: + if: ${{ inputs.runner == 'b200' }} + strategy: + fail-fast: false + matrix: + runner: + - 'b200-nvd_0' + - 'b200-nvd_1' + - 'b200-nvd_2' + - 'b200-nvd_3' + config: + - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } + - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[4]' + + bmk-b200-trt: + if: ${{ inputs.runner == 'b200-trt' }} + strategy: + fail-fast: false + matrix: + runner: + - 'b200-nv_0' + - 'b200-nv_1' + - 'b200-nb_0' + - 'b200-nb_1' + config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } + - { 
image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-mi300x: + if: ${{ inputs.runner == 'mi300x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi300x-amd_0' + - 'mi300x-amd_1' + - 'mi300x-amd_2' + - 'mi300x-amd_3' + - 'mi300x-amd_4' + - 'mi300x-cr_0' + - 'mi300x-oci_0' + config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-mi325x: + if: ${{ inputs.runner == 'mi325x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi325x-amd_0' + - 'mi325x-tw_0' + - 'mi325x-tw_1' + - 'mi325x-tw_2' + - 'mi325x-tw_3' + config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 
'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk-mi355x: + if: ${{ inputs.runner == 'mi355x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi355x-amd_0' + - 'mi355x-amd_1' + - 'mi355x-amd_2' + - 'mi355x-amd_3' + config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml new file mode 100644 index 
000000000..8f824c4d1 --- /dev/null +++ b/.github/workflows/runner-sweep-test.yml @@ -0,0 +1,328 @@ +name: 'Test - Runner Sweep' +run-name: '${{ github.event.inputs.runner }} Sweep - ${{ github.event.inputs.model }}' +on: + workflow_dispatch: + inputs: + runner: + description: 'Runner Type' + required: true + type: choice + options: + - 'h100' + - 'h200' + - 'b200' + - 'h200-trt' + - 'b200-trt' + - 'mi300x' + - 'mi325x' + - 'mi355x' + - 'gb200' + + image: + description: 'Docker Image' + required: true + type: choice + options: + - 'lmsysorg/sglang:v0.4.9.post1-cu126' + - 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' + - 'lmsysorg/sglang:v0.5.2rc2-cu126' + - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' + - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' + - 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' + - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + - 'vllm/vllm-openai:v0.10.2' + + model: + description: 'Model' + required: true + type: choice + options: + - 'amd/DeepSeek-R1-0528-MXFP4-Preview' + - 'deepseek-ai/DeepSeek-R1-0528' + - 'nvidia/DeepSeek-R1-0528-FP4' + - 'nvidia/DeepSeek-R1-0528-FP4-v2' + - 'openai/gpt-oss-120b' + + framework: + description: 'Framework' + required: true + type: choice + options: + - 'vllm' + - 'sglang' + - 'trt' + + precision: + description: 'Precision' + required: true + type: choice + options: + - 'fp8' + - 'fp4' + + exp-name: + description: 'Experiment Name' + required: true + type: choice + options: + - 'dsr1_test' + - 'gptoss_test' + + +env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + 
HF_HUB_CACHE: '/mnt/hf_hub_cache/' + +jobs: + bmk_h100: + if: ${{ inputs.runner == 'h100' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h100-cr_0' + - 'h100-cr_1' + - 'h100-cw_0' + - 'h100-cw_1' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_h200: + if: ${{ inputs.runner == 'h200' || inputs.runner == 'h200-trt' }} + strategy: + fail-fast: false + matrix: + runner: + - 'h200-cw_0' + - 'h200-cw_1' + - 'h200-nb_0' + - 'h200-nb_1' + - 'h200-nb_2' + - 'h200-nb_3' + - 'h200-nv_0' + - 'h200-nv_1' + - 'h200-nv_2' + - 'h200-nv_3' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[4]' + conc-list: '[64]' + + bmk_b200: + if: ${{ inputs.runner == 'b200' }} + strategy: + fail-fast: false + matrix: + runner: + - 'b200-nv_0' + - 'b200-nv_1' + - 'b200-nvd_0' + - 'b200-nvd_1' + - 'b200-tg_0' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_b200-trt: + if: ${{ inputs.runner == 'b200-trt' }} + strategy: + 
fail-fast: false + matrix: + runner: + - 'b200-nv_0' + - 'b200-nv_1' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_mi300x: + if: ${{ inputs.runner == 'mi300x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi300x-amd_0' + - 'mi300x-amd_1' + - 'mi300x-amd_2' + - 'mi300x-amd_3' + - 'mi300x-amd_4' + - 'mi300x-cr_0' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_mi325x: + if: ${{ inputs.runner == 'mi325x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi325x-amd_0' + - 'mi325x-tw_0' + - 'mi325x-tw_1' + - 'mi325x-tw_2' + - 'mi325x-tw_3' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_mi355x: + if: ${{ inputs.runner == 'mi355x' }} + strategy: + fail-fast: false + matrix: + runner: + - 'mi355x-amd_0' + - 'mi355x-amd_1' + - 'mi355x-amd_2' + - 'mi355x-amd_3' + + name: '${{ matrix.runner }}' + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: 
inherit + with: + runner: ${{ matrix.runner }} + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + tp-list: '[8]' + conc-list: '[1]' + + bmk_gb200: + if: ${{ inputs.runner == 'gb200' && inputs.framework == 'trt' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + secrets: inherit + with: + runner: gb200 + image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' + model: 'deepseek-r1-fp4' + framework: 'dynamo-trtllm' + precision: 'fp4' + exp-name: ${{ inputs.exp-name }} + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + mtp-mode: 'off' + + bmk_gb200-sgl: + if: ${{ inputs.runner == 'gb200' && inputs.framework == 'sglang' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + secrets: inherit + with: + runner: gb200 + image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' + model: 'deepseek-ai/DeepSeek-R1-0528' + framework: 'dynamo-sglang' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: 8192 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + mtp-mode: 'off' + + collect-test-results: + needs: [ bmk_h100, bmk_h200, bmk_b200, bmk_b200-trt, bmk_mi300x, bmk_mi325x, bmk_mi355x, bmk_gb200, bmk_gb200-sgl ] + if: ${{ always() && !cancelled() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: ${{ inputs.exp-name }} diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on 
$SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env 
bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git 
a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 094fbd19c..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir 
/workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index dfb2324b9..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio 
$RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 1febeff13..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json From 24ea7def4aeaade77e95e35ae0cab30c5259a9ff Mon Sep 17 00:00:00 2001 From: "kimbo@semianalysis.com" Date: Thu, 30 Oct 2025 02:05:35 +0000 Subject: [PATCH 074/149] revert remove: llama 70b --- .github/workflows/70b-tmpl.yml | 230 ++++++++++++++++++ .github/workflows/full-sweep-tmpl.yml | 75 ++++++ .github/workflows/runner-model-sweep-test.yml | 11 + .github/workflows/runner-sweep-test.yml | 5 + benchmarks/70b_fp4_b200_docker.sh | 48 ++++ benchmarks/70b_fp4_b200_trt_docker.sh | 46 ++++ benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ++++++ benchmarks/70b_fp4_mi355x_docker.sh | 55 +++++ benchmarks/70b_fp4_mi355x_slurm.sh | 84 +++++++ benchmarks/70b_fp8_b200_docker.sh | 46 ++++ benchmarks/70b_fp8_b200_trt_docker.sh | 46 ++++ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ++++++ benchmarks/70b_fp8_h100_docker.sh | 29 +++ 
benchmarks/70b_fp8_h100_slurm.sh | 60 +++++ benchmarks/70b_fp8_h200_slurm.sh | 69 ++++++ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ++++++ benchmarks/70b_fp8_mi300x_docker.sh | 59 +++++ benchmarks/70b_fp8_mi300x_slurm.sh | 92 +++++++ benchmarks/70b_fp8_mi325x_docker.sh | 53 ++++ benchmarks/70b_fp8_mi325x_slurm.sh | 86 +++++++ benchmarks/70b_fp8_mi355x_docker.sh | 50 ++++ benchmarks/70b_fp8_mi355x_slurm.sh | 75 ++++++ 22 files changed, 1439 insertions(+) create mode 100644 .github/workflows/70b-tmpl.yml create mode 100644 benchmarks/70b_fp4_b200_docker.sh create mode 100644 benchmarks/70b_fp4_b200_trt_docker.sh create mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp4_mi355x_docker.sh create mode 100644 benchmarks/70b_fp4_mi355x_slurm.sh create mode 100644 benchmarks/70b_fp8_b200_docker.sh create mode 100644 benchmarks/70b_fp8_b200_trt_docker.sh create mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh create mode 100755 benchmarks/70b_fp8_h100_docker.sh create mode 100644 benchmarks/70b_fp8_h100_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_mi300x_docker.sh create mode 100644 benchmarks/70b_fp8_mi300x_slurm.sh create mode 100644 benchmarks/70b_fp8_mi325x_docker.sh create mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh create mode 100644 benchmarks/70b_fp8_mi355x_docker.sh create mode 100644 benchmarks/70b_fp8_mi355x_slurm.sh diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml new file mode 100644 index 000000000..3d1dd5051 --- /dev/null +++ b/.github/workflows/70b-tmpl.yml @@ -0,0 +1,230 @@ +name: Template - LLaMA 70B + +on: + workflow_call: + inputs: + exp-name: + required: true + type: string + isl: + required: true + type: string + osl: + required: true + type: string + max-model-len: + required: true + type: string + random-range-ratio: + required: true + type: string + + use_h100: + type: boolean 
+ required: true + use_h200: + type: boolean + required: true + use_b200: + type: boolean + required: true + use_mi300x: + type: boolean + required: true + use_mi325x: + type: boolean + required: true + use_mi355x: + type: boolean + required: true + +jobs: + bmk-h100-fp8: + if: ${{ inputs.use_h100 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: h100 + image: 'vllm/vllm-openai:v0.10.2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[2, 4, 8]' + + bmk-h200-fp8: + if: ${{ inputs.use_h200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: h200 + image: 'vllm/vllm-openai:v0.10.2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + + bmk-h200-trt-fp8: + if: ${{ inputs.use_h200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: h200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'trt' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128 + + bmk-b200-fp8: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200 + image: 'vllm/vllm-openai:v0.10.2' + model: 
'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + + bmk-b200-trt-fp8: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP8' + framework: 'trt' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 + + bmk-mi300x-fp8: + if: ${{ inputs.use_mi300x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi300x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + + bmk-mi325x-fp8: + if: ${{ inputs.use_mi325x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi325x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 
4, 8]' + + bmk-mi355x-fp8: + if: ${{ inputs.use_mi355x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi355x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' + framework: 'vllm' + precision: 'fp8' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' + + bmk-b200-fp4: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200 + image: 'vllm/vllm-openai:v0.10.2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP4' + framework: 'vllm' + precision: 'fp4' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + + bmk-b200-trt-fp4: + if: ${{ inputs.use_b200 }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: b200-trt + image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' + model: 'nvidia/Llama-3.3-70B-Instruct-FP4' + framework: 'trt' + precision: 'fp4' + exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has + conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 + + bmk-mi355x-fp4: + if: ${{ inputs.use_mi355x }} + uses: ./.github/workflows/benchmark-tmpl.yml + secrets: inherit + with: + runner: mi355x + image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' + model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' + framework: 'vllm' + precision: 'fp4' + 
exp-name: ${{ inputs.exp-name }} + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ inputs.max-model-len }} + random-range-ratio: ${{ inputs.random-range-ratio }} + tp-list: '[1, 2, 4, 8]' diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml index 869928cb7..b086460df 100644 --- a/.github/workflows/full-sweep-tmpl.yml +++ b/.github/workflows/full-sweep-tmpl.yml @@ -37,6 +37,31 @@ on: default: false jobs: + _70b-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-70b-1k1k-results: + needs: _70b-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_1k1k' + dsr1-1k1k: if: ${{ inputs.run_1k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -87,6 +112,31 @@ jobs: with: exp-name: 'gptoss_1k1k' + _70b-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-70b-8k1k-results: + needs: _70b-8k1k + if: ${{ inputs.run_8k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_8k1k' + dsr1-8k1k: if: ${{ inputs.run_8k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -137,6 +187,31 @@ jobs: with: exp-name: 'gptoss_8k1k' + _70b-1k8k: + if: ${{ 
inputs.run_1k8k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_1k8k' + isl: 1024 + osl: 8192 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} + + collect-70b-1k8k-results: + needs: _70b-1k8k + if: ${{ inputs.run_1k8k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_1k8k' + dsr1-1k8k: if: ${{ inputs.run_1k8k }} uses: ./.github/workflows/dsr1-tmpl.yml diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml index e4f2b7303..212ffc07c 100644 --- a/.github/workflows/runner-model-sweep-test.yml +++ b/.github/workflows/runner-model-sweep-test.yml @@ -33,6 +33,7 @@ jobs: - 'h100-cw_0' - 'h100-cw_1' config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } name: '${{ matrix.runner }}' @@ -69,6 +70,7 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -106,6 +108,7 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } 
- { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } @@ -137,6 +140,8 @@ jobs: - 'b200-nvd_2' - 'b200-nvd_3' config: + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } + - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -169,6 +174,8 @@ jobs: - 'b200-nb_0' - 'b200-nb_1' config: + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } + - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'trt', precision: 'fp4', exp-name: '70b_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } 
@@ -204,6 +211,7 @@ jobs: - 'mi300x-cr_0' - 'mi300x-oci_0' config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -236,6 +244,7 @@ jobs: - 'mi325x-tw_2' - 'mi325x-tw_3' config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -267,6 +276,8 @@ jobs: - 'mi355x-amd_2' - 'mi355x-amd_3' config: + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } + - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 
'fp4', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml index 8f824c4d1..fd100474f 100644 --- a/.github/workflows/runner-sweep-test.yml +++ b/.github/workflows/runner-sweep-test.yml @@ -45,7 +45,11 @@ on: type: choice options: - 'amd/DeepSeek-R1-0528-MXFP4-Preview' + - 'amd/Llama-3.3-70B-Instruct-FP8-KV' + - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - 'deepseek-ai/DeepSeek-R1-0528' + - 'nvidia/Llama-3.3-70B-Instruct-FP8' + - 'nvidia/Llama-3.3-70B-Instruct-FP4' - 'nvidia/DeepSeek-R1-0528-FP4' - 'nvidia/DeepSeek-R1-0528-FP4-v2' - 'openai/gpt-oss-120b' @@ -72,6 +76,7 @@ on: required: true type: choice options: + - '70b_test' - 'dsr1_test' - 'gptoss_test' diff --git a/benchmarks/70b_fp4_b200_docker.sh b/benchmarks/70b_fp4_b200_docker.sh new file mode 100644 index 000000000..a76ffb9f8 --- /dev/null +++ b/benchmarks/70b_fp4_b200_docker.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +nvidia-smi + +# To improve CI stability, we patch this helper function to prevent a race condition that +# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 +sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +export TORCH_CUDA_ARCH_LIST="10.0" +export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' +export PYTHONNOUSERSITE=1 + +set -x +vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ +--gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ +--disable-log-requests diff --git a/benchmarks/70b_fp4_b200_trt_docker.sh b/benchmarks/70b_fp4_b200_trt_docker.sh new file mode 100644 index 000000000..e30478672 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_docker.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 
+kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +set -x +# Launch TRT-LLM server +mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ +--max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options 
llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh new file mode 100644 index 000000000..681a629fb --- /dev/null +++ b/benchmarks/70b_fp4_mi355x_docker.sh @@ -0,0 +1,55 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +fi + +set -x +vllm serve $MODEL \ +--host=0.0.0.0 \ +--port $PORT \ 
+--swap-space 64 \ +--max-model-len $MAX_MODEL_LEN \ +--tensor-parallel-size $TP \ +--max-num-seqs 1024 \ +--kv-cache-dtype fp8 \ +--gpu-memory-utilization 0.94 \ +--max-seq-len-to-capture $MAX_MODEL_LEN \ +--max-num-batched-tokens 131072 \ +--no-enable-prefix-caching \ +--disable-log-requests \ +--async-scheduling diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh new file mode 100644 index 000000000..0d5a469d0 --- /dev/null +++ b/benchmarks/70b_fp4_mi355x_slurm.sh @@ -0,0 +1,84 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# PORT +# RESULT_FILENAME + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=8888 + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 + if [[ "$CONC" -le "16" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi +fi + + +set -x +vllm serve $MODEL \ +--host=0.0.0.0 \ +--port $PORT \ +--swap-space 64 \ +--max-model-len $MAX_MODEL_LEN \ +--tensor-parallel-size $TP \ +--max-num-seqs 1024 \ +--kv-cache-dtype fp8 \ +--gpu-memory-utilization 0.94 \ +--max-seq-len-to-capture $MAX_MODEL_LEN \ +--max-num-batched-tokens 131072 \ +--no-enable-prefix-caching \ +--disable-log-requests \ +--async-scheduling > $SERVER_LOG 
2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url "http://0.0.0.0:$PORT" \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics "ttft,tpot,itl,e2el" \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json + diff --git a/benchmarks/70b_fp8_b200_docker.sh b/benchmarks/70b_fp8_b200_docker.sh new file mode 100644 index 000000000..dbcfaf6fd --- /dev/null +++ b/benchmarks/70b_fp8_b200_docker.sh @@ -0,0 +1,46 @@ +#!/usr/bin/bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +nvidia-smi + +# To improve CI stability, we patch this helper function to prevent a race condition that +# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 +sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py + + + +FUSION_FLAG='{'\ +'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\ +'"custom_ops": ["+quant_fp8", "+rms_norm"],'\ +'"cudagraph_mode": "FULL_DECODE_ONLY",'\ +'"splitting_ops": []'\ +'}' +cat > config.yaml <<-EOF +kv-cache-dtype: fp8 +compilation-config: '$FUSION_FLAG' +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $MAX_MODEL_LEN +EOF + +cat config.yaml # Debugging + +export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' +export PYTHONNOUSERSITE=1 + +set -x +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=512 \ +--config config.yaml \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_b200_trt_docker.sh b/benchmarks/70b_fp8_b200_trt_docker.sh new file mode 100644 index 000000000..e30478672 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_docker.sh @@ -0,0 +1,46 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false 
+stream_interval: 10 +EOF +fi + +set -x +# Launch TRT-LLM server +mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ +--max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone 
https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h100_docker.sh b/benchmarks/70b_fp8_h100_docker.sh new file mode 100755 index 000000000..5d8df1bac --- /dev/null +++ b/benchmarks/70b_fp8_h100_docker.sh @@ -0,0 +1,29 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# MAX_MODEL_LEN +# TP +# CONC + +pip install -q datasets pandas + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: 10240 +EOF + +export PYTHONNOUSERSITE=1 + +vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ +--config=config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_h100_slurm.sh b/benchmarks/70b_fp8_h100_slurm.sh new file mode 100644 index 000000000..485aa8817 --- /dev/null +++ b/benchmarks/70b_fp8_h100_slurm.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: 10240 +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + +export TORCH_CUDA_ARCH_LIST="9.0" + +set -x +PYTHONNOUSERSITE=1 vllm serve $MODEL 
--host=0.0.0.0 --port=$PORT \ +--config=config.yaml \ +--gpu-memory-utilization=0.9 \ +--tensor-parallel-size=$TP \ +--max-num-seqs=$CONC \ +--disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +pip install -q datasets pandas +git clone https://github.com/kimbochen/bench_serving.git +set -x +python3 bench_serving/benchmark_serving.py \ +--model=$MODEL \ +--backend=vllm \ +--base-url="http://0.0.0.0:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +--result-dir=/workspace/ \ +--result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh new file mode 100644 index 000000000..094fbd19c --- /dev/null +++ b/benchmarks/70b_fp8_h200_slurm.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +pip install datasets pandas + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +# Create config.yaml +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) 
+PORT=$(( 8888 + $PORT_OFFSET )) + +export TORCH_CUDA_ARCH_LIST="9.0" + +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ + --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh new file mode 100644 index 000000000..dfb2324b9 --- /dev/null +++ b/benchmarks/70b_fp8_h200_trt_slurm.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + 
enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh new file mode 100644 index 000000000..941e95023 --- /dev/null +++ b/benchmarks/70b_fp8_mi300x_docker.sh @@ -0,0 +1,59 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that features to avoid crashes. 
+# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh new file mode 100644 index 000000000..b387505f0 --- /dev/null +++ b/benchmarks/70b_fp8_mi300x_slurm.sh @@ -0,0 +1,92 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=8888 + +# Reference +# 
https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that features to avoid crashes. +# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
+set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model=$MODEL --backend=vllm \ +--base-url="http://0.0.0.0:$PORT" \ +--dataset-name=random \ +--random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ +--num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ +--request-rate=inf --ignore-eos \ +--save-result --percentile-metrics='ttft,tpot,itl,e2el' \ +--result-dir=/workspace/ \ +--result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh new file mode 100644 index 000000000..9e1fcdf8b --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_docker.sh @@ -0,0 +1,53 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" 
]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh new file mode 100644 index 000000000..1febeff13 --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -0,0 +1,86 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == 
"8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh new file mode 100644 index 000000000..6310a5f64 --- /dev/null +++ 
b/benchmarks/70b_fp8_mi355x_docker.sh @@ -0,0 +1,50 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# PORT +# TP +# CONC +# MAX_MODEL_LEN + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +sleep 5 +cat config.yaml + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=auto --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests + diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh new file mode 100644 index 000000000..2abfee137 --- /dev/null +++ b/benchmarks/70b_fp8_mi355x_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# ========= Required Env Vars ========= +# HF_TOKEN +# HF_HUB_CACHE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# PORT +# RESULT_FILENAME + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": 
["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=auto --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url "http://0.0.0.0:$PORT" \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics "ttft,tpot,itl,e2el" \ +--result-dir /workspace/ --result-filename $RESULT_FILENAME.json + +exit From b89047d5fc054f47e046433351a49014ec4cd95d Mon Sep 17 00:00:00 2001 From: functionstackx <47992694+functionstackx@users.noreply.github.com> Date: Wed, 29 Oct 2025 22:18:46 -0400 Subject: [PATCH 075/149] remove llama 70b (#149) --- .github/workflows/70b-tmpl.yml | 230 ------------------ .github/workflows/full-sweep-tmpl.yml | 75 ------ 
.github/workflows/runner-model-sweep-test.yml | 11 - .github/workflows/runner-sweep-test.yml | 5 - benchmarks/70b_fp4_b200_docker.sh | 48 ---- benchmarks/70b_fp4_b200_trt_docker.sh | 46 ---- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ------ benchmarks/70b_fp4_mi355x_docker.sh | 55 ----- benchmarks/70b_fp4_mi355x_slurm.sh | 84 ------- benchmarks/70b_fp8_b200_docker.sh | 46 ---- benchmarks/70b_fp8_b200_trt_docker.sh | 46 ---- benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ------ benchmarks/70b_fp8_h100_docker.sh | 29 --- benchmarks/70b_fp8_h100_slurm.sh | 60 ----- benchmarks/70b_fp8_h200_slurm.sh | 69 ------ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ------ benchmarks/70b_fp8_mi300x_docker.sh | 59 ----- benchmarks/70b_fp8_mi300x_slurm.sh | 92 ------- benchmarks/70b_fp8_mi325x_docker.sh | 53 ---- benchmarks/70b_fp8_mi325x_slurm.sh | 86 ------- benchmarks/70b_fp8_mi355x_docker.sh | 50 ---- benchmarks/70b_fp8_mi355x_slurm.sh | 75 ------ 22 files changed, 1439 deletions(-) delete mode 100644 .github/workflows/70b-tmpl.yml delete mode 100644 benchmarks/70b_fp4_b200_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp4_mi355x_docker.sh delete mode 100644 benchmarks/70b_fp4_mi355x_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_docker.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100755 benchmarks/70b_fp8_h100_docker.sh delete mode 100644 benchmarks/70b_fp8_h100_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi300x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi300x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_docker.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi355x_docker.sh delete mode 100644 
benchmarks/70b_fp8_mi355x_slurm.sh diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml deleted file mode 100644 index 3d1dd5051..000000000 --- a/.github/workflows/70b-tmpl.yml +++ /dev/null @@ -1,230 +0,0 @@ -name: Template - LLaMA 70B - -on: - workflow_call: - inputs: - exp-name: - required: true - type: string - isl: - required: true - type: string - osl: - required: true - type: string - max-model-len: - required: true - type: string - random-range-ratio: - required: true - type: string - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - -jobs: - bmk-h100-fp8: - if: ${{ inputs.use_h100 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h100 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[2, 4, 8]' - - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-h200-trt-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 
'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-b200-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp8: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP8' - framework: 'trt' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 - - bmk-mi300x-fp8: - if: ${{ inputs.use_mi300x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi300x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - 
tp-list: '[1, 2, 4, 8]' - - bmk-mi325x-fp8: - if: ${{ inputs.use_mi325x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi325x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-mi355x-fp8: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-FP8-KV' - framework: 'vllm' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - - bmk-b200-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200 - image: 'vllm/vllm-openai:v0.10.2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - - bmk-b200-trt-fp4: - if: ${{ inputs.use_b200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: b200-trt - image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - model: 'nvidia/Llama-3.3-70B-Instruct-FP4' - framework: 'trt' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - 
random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 - - bmk-mi355x-fp4: - if: ${{ inputs.use_mi355x }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: mi355x - image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - framework: 'vllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml index b086460df..869928cb7 100644 --- a/.github/workflows/full-sweep-tmpl.yml +++ b/.github/workflows/full-sweep-tmpl.yml @@ -37,31 +37,6 @@ on: default: false jobs: - _70b-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k1k-results: - needs: _70b-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - dsr1-1k1k: if: ${{ inputs.run_1k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -112,31 +87,6 @@ jobs: with: exp-name: 'gptoss_1k1k' - _70b-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - 
use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-8k1k-results: - needs: _70b-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - dsr1-8k1k: if: ${{ inputs.run_8k1k }} uses: ./.github/workflows/dsr1-tmpl.yml @@ -187,31 +137,6 @@ jobs: with: exp-name: 'gptoss_8k1k' - _70b-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-70b-1k8k-results: - needs: _70b-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k8k' - dsr1-1k8k: if: ${{ inputs.run_1k8k }} uses: ./.github/workflows/dsr1-tmpl.yml diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml index 212ffc07c..e4f2b7303 100644 --- a/.github/workflows/runner-model-sweep-test.yml +++ b/.github/workflows/runner-model-sweep-test.yml @@ -33,7 +33,6 @@ jobs: - 'h100-cw_0' - 'h100-cw_1' config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } name: '${{ matrix.runner }}' @@ -70,7 +69,6 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', 
exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -108,7 +106,6 @@ jobs: - 'h200-nv_2' - 'h200-nv_3' config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } @@ -140,8 +137,6 @@ jobs: - 'b200-nvd_2' - 'b200-nvd_3' config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -174,8 +169,6 @@ jobs: - 'b200-nb_0' - 'b200-nb_1' config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP8', framework: 'trt', precision: 'fp8', exp-name: '70b_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/Llama-3.3-70B-Instruct-FP4', 
framework: 'trt', precision: 'fp4', exp-name: '70b_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } @@ -211,7 +204,6 @@ jobs: - 'mi300x-cr_0' - 'mi300x-oci_0' config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -244,7 +236,6 @@ jobs: - 'mi325x-tw_2' - 'mi325x-tw_3' config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } @@ -276,8 +267,6 @@ jobs: - 'mi355x-amd_2' - 'mi355x-amd_3' config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: 
'70b_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml index fd100474f..8f824c4d1 100644 --- a/.github/workflows/runner-sweep-test.yml +++ b/.github/workflows/runner-sweep-test.yml @@ -45,11 +45,7 @@ on: type: choice options: - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'amd/Llama-3.3-70B-Instruct-FP8-KV' - - 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview' - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/Llama-3.3-70B-Instruct-FP8' - - 'nvidia/Llama-3.3-70B-Instruct-FP4' - 'nvidia/DeepSeek-R1-0528-FP4' - 'nvidia/DeepSeek-R1-0528-FP4-v2' - 'openai/gpt-oss-120b' @@ -76,7 +72,6 @@ on: required: true type: choice options: - - '70b_test' - 'dsr1_test' - 'gptoss_test' diff --git a/benchmarks/70b_fp4_b200_docker.sh b/benchmarks/70b_fp4_b200_docker.sh deleted file mode 100644 index a76ffb9f8..000000000 --- a/benchmarks/70b_fp4_b200_docker.sh +++ /dev/null @@ -1,48 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -compilation-config: '{"pass_config":{"enable_fi_allreduce_fusion":true,"enable_attn_fusion":true,"enable_noop":true},"custom_ops":["+quant_fp8","+rms_norm"],"cudagraph_mode":"FULL_DECODE_ONLY","splitting_ops":[]}' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -export TORCH_CUDA_ARCH_LIST="10.0" -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ ---gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs 512 \ ---disable-log-requests diff --git a/benchmarks/70b_fp4_b200_trt_docker.sh b/benchmarks/70b_fp4_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp4_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 
-kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options 
llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh deleted file mode 100644 index 681a629fb..000000000 --- a/benchmarks/70b_fp4_mi355x_docker.sh +++ /dev/null @@ -1,55 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ 
---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh deleted file mode 100644 index 0d5a469d0..000000000 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ /dev/null @@ -1,84 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -le "16" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi - if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then - export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 - else - export VLLM_TRITON_FP4_GEMM_USE_ASM=1 - fi -fi - - -set -x -vllm serve $MODEL \ ---host=0.0.0.0 \ ---port $PORT \ ---swap-space 64 \ ---max-model-len $MAX_MODEL_LEN \ ---tensor-parallel-size $TP \ ---max-num-seqs 1024 \ ---kv-cache-dtype fp8 \ ---gpu-memory-utilization 0.94 \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ ---max-num-batched-tokens 131072 \ ---no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > 
$SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - diff --git a/benchmarks/70b_fp8_b200_docker.sh b/benchmarks/70b_fp8_b200_docker.sh deleted file mode 100644 index dbcfaf6fd..000000000 --- a/benchmarks/70b_fp8_b200_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -nvidia-smi - -# To improve CI stability, we patch this helper function to prevent a race condition that -# happens 1% of the time. 
ref: https://github.com/flashinfer-ai/flashinfer/pull/1779 -sed -i '102,108d' /usr/local/lib/python3.12/dist-packages/flashinfer/jit/cubin_loader.py - - - -FUSION_FLAG='{'\ -'"pass_config": {"enable_fi_allreduce_fusion": true, "enable_attn_fusion": true, "enable_noop": true},'\ -'"custom_ops": ["+quant_fp8", "+rms_norm"],'\ -'"cudagraph_mode": "FULL_DECODE_ONLY",'\ -'"splitting_ops": []'\ -'}' -cat > config.yaml <<-EOF -kv-cache-dtype: fp8 -compilation-config: '$FUSION_FLAG' -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $MAX_MODEL_LEN -EOF - -cat config.yaml # Debugging - -export VLLM_FLASHINFER_ALLREDUCE_FUSION_THRESHOLDS_MB='{"2":32,"4":32,"8":8}' -export PYTHONNOUSERSITE=1 - -set -x -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=512 \ ---config config.yaml \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_b200_trt_docker.sh b/benchmarks/70b_fp8_b200_trt_docker.sh deleted file mode 100644 index e30478672..000000000 --- a/benchmarks/70b_fp8_b200_trt_docker.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false 
-stream_interval: 10 -EOF -fi - -set -x -# Launch TRT-LLM server -mpirun -n 1 --allow-run-as-root --oversubscribe trtllm-serve $MODEL --tp_size $TP --trust_remote_code \ ---max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone 
https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h100_docker.sh b/benchmarks/70b_fp8_h100_docker.sh deleted file mode 100755 index 5d8df1bac..000000000 --- a/benchmarks/70b_fp8_h100_docker.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# MAX_MODEL_LEN -# TP -# CONC - -pip install -q datasets pandas - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -export PYTHONNOUSERSITE=1 - -vllm serve $MODEL --host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_h100_slurm.sh b/benchmarks/70b_fp8_h100_slurm.sh deleted file mode 100644 index 485aa8817..000000000 --- a/benchmarks/70b_fp8_h100_slurm.sh +++ /dev/null @@ -1,60 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: 10240 -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -export TORCH_CUDA_ARCH_LIST="9.0" - -set -x -PYTHONNOUSERSITE=1 vllm serve $MODEL 
--host=0.0.0.0 --port=$PORT \ ---config=config.yaml \ ---gpu-memory-utilization=0.9 \ ---tensor-parallel-size=$TP \ ---max-num-seqs=$CONC \ ---disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -pip install -q datasets pandas -git clone https://github.com/kimbochen/bench_serving.git -set -x -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL \ ---backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 094fbd19c..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) 
-PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index dfb2324b9..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - 
enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh deleted file mode 100644 index 941e95023..000000000 --- a/benchmarks/70b_fp8_mi300x_docker.sh +++ /dev/null @@ -1,59 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. 
-# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh deleted file mode 100644 index b387505f0..000000000 --- a/benchmarks/70b_fp8_mi300x_slurm.sh +++ /dev/null @@ -1,92 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=8888 - -# Reference -# 
https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -# If the machine runs a MEC FW older than 177, RCCL -# cannot reclaim some memory. -# Disable that features to avoid crashes. -# This is related to the changes in the driver at: -# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` -if [[ "$version" == "" || $version -lt 177 ]]; then - export HSA_NO_SCRATCH_RECLAIM=1 -fi - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model=$MODEL --backend=vllm \ ---base-url="http://0.0.0.0:$PORT" \ ---dataset-name=random \ ---random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \ ---num-prompts=$(( $CONC * 10 )) --max-concurrency=$CONC \ ---request-rate=inf --ignore-eos \ ---save-result --percentile-metrics='ttft,tpot,itl,e2el' \ ---result-dir=/workspace/ \ ---result-filename=$RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh deleted file mode 100644 index 9e1fcdf8b..000000000 --- a/benchmarks/70b_fp8_mi325x_docker.sh +++ /dev/null @@ -1,53 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == 
"1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 1febeff13..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && 
"$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh deleted file mode 100644 index 6310a5f64..000000000 --- 
a/benchmarks/70b_fp8_mi355x_docker.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# PORT -# TP -# CONC -# MAX_MODEL_LEN - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -sleep 5 -cat config.yaml - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests - diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh deleted file mode 100644 index 2abfee137..000000000 --- a/benchmarks/70b_fp8_mi355x_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# ========= Required Env Vars ========= -# HF_TOKEN -# HF_HUB_CACHE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# PORT -# RESULT_FILENAME - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: 
'{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url "http://0.0.0.0:$PORT" \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics "ttft,tpot,itl,e2el" \ ---result-dir /workspace/ --result-filename $RESULT_FILENAME.json - -exit From be3b40f5314e4bd001dcb4a12e024813d32befc6 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 076/149] testing concurrency From 13803ac4347461c77a72eba1746b70dbdb6af172 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 077/149] adding more workflows --- 
.github/workflows/1k8k-sweep.yml | 5 + .github/workflows/test.yml | 147 ++++++++++++++++++ utils/matrix-logic/get_test_sweep_configs.py | 151 +++++++++++++++++++ 3 files changed, 303 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 25fc3a362..581ec07cf 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,6 +4,11 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false +on: + workflow_dispatch: + schedule: + - cron: '0 23 * * *' + on: # pull_request: workflow_dispatch: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: "Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - 
"mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download 
results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py new file mode 100644 index 000000000..87ab0457b --- /dev/null +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -0,0 +1,151 @@ +import json +import yaml +import sys +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from a specific configuration key' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
+ ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Load and merge all config files + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. 
" + f"Available keys: {available_keys}" + ) + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + print(json.dumps(matrix_values)) + 
return matrix_values + +if __name__ == "__main__": + main() \ No newline at end of file From 422e5b86a01700f0f346b319fdba01631b4314bd Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 078/149] deleting files --- .github/workflows/full-sweep-tmpl.yml | 188 ---------- .github/workflows/runner-model-sweep-test.yml | 289 --------------- .github/workflows/runner-sweep-test.yml | 328 ------------------ .github/workflows/test.yml | 147 -------- 4 files changed, 952 deletions(-) delete mode 100644 .github/workflows/full-sweep-tmpl.yml delete mode 100644 .github/workflows/runner-model-sweep-test.yml delete mode 100644 .github/workflows/runner-sweep-test.yml delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml deleted file mode 100644 index 869928cb7..000000000 --- a/.github/workflows/full-sweep-tmpl.yml +++ /dev/null @@ -1,188 +0,0 @@ -name: Template - Full Sweep - -on: - workflow_call: - inputs: - run_1k1k: - type: boolean - required: true - run_8k1k: - type: boolean - required: true - run_1k8k: - type: boolean - required: true - - use_h100: - type: boolean - required: true - use_h200: - type: boolean - required: true - use_b200: - type: boolean - required: true - use_mi300x: - type: boolean - required: true - use_mi325x: - type: boolean - required: true - use_mi355x: - type: boolean - required: true - use_gb200: - type: boolean - required: false - default: false - -jobs: - dsr1-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k1k-results: - needs: 
dsr1-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k1k' - - gptoss-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k1k-results: - needs: gptoss-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - - dsr1-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-8k1k-results: - needs: dsr1-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_8k1k' - - gptoss-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-8k1k-results: - needs: gptoss-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: 
inherit - with: - exp-name: 'gptoss_8k1k' - - dsr1-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/dsr1-tmpl.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - use_gb200: ${{ inputs.use_gb200 }} - - collect-dsr1-1k8k-results: - needs: dsr1-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'dsr1_1k8k' - - gptoss-1k8k: - if: ${{ inputs.run_1k8k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' - isl: 1024 - osl: 8192 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} - - collect-gptoss-1k8k-results: - needs: gptoss-1k8k - if: ${{ inputs.run_1k8k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k8k' diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml deleted file mode 100644 index e4f2b7303..000000000 --- a/.github/workflows/runner-model-sweep-test.yml +++ /dev/null @@ -1,289 +0,0 @@ -name: 'Test - Runner Model Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'h200-trt' - - 'b200' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk-h100: - if: ${{ inputs.runner == 'h100' }} - strategy: 
- fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - config: - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200: - if: ${{ inputs.runner == 'h200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 'lmsysorg/sglang:v0.5.2rc2-cu126', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-h200-trt: - if: ${{ inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - config: - - { image: 
'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-nvd_2' - - 'b200-nvd_3' - config: - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'lmsysorg/sglang:v0.5.3rc1-cu129-b200', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'vllm/vllm-openai:v0.10.2', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[4]' - - bmk-b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nb_0' - - 
'b200-nb_1' - config: - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'trt', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'nvidia/DeepSeek-R1-0528-FP4', framework: 'trt', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2', model: 'openai/gpt-oss-120b', framework: 'trt', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - 'mi300x-oci_0' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - 
conc-list: '[1]' - - bmk-mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk-mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - config: - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' } - - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' } - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: 
${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml deleted file mode 100644 index 8f824c4d1..000000000 --- a/.github/workflows/runner-sweep-test.yml +++ /dev/null @@ -1,328 +0,0 @@ -name: 'Test - Runner Sweep' -run-name: '${{ github.event.inputs.runner }} Sweep - ${{ github.event.inputs.model }}' -on: - workflow_dispatch: - inputs: - runner: - description: 'Runner Type' - required: true - type: choice - options: - - 'h100' - - 'h200' - - 'b200' - - 'h200-trt' - - 'b200-trt' - - 'mi300x' - - 'mi325x' - - 'mi355x' - - 'gb200' - - image: - description: 'Docker Image' - required: true - type: choice - options: - - 'lmsysorg/sglang:v0.4.9.post1-cu126' - - 'lmsysorg/sglang:v0.5.0rc1-cu128-b200' - - 'lmsysorg/sglang:v0.5.2rc2-cu126' - - 'lmsysorg/sglang:v0.5.3rc1-cu129-b200' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' - - 'nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1' - - 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' - - 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915' - - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' - - 'vllm/vllm-openai:v0.10.2' - - model: - description: 'Model' - required: true - type: choice - options: - - 'amd/DeepSeek-R1-0528-MXFP4-Preview' - - 'deepseek-ai/DeepSeek-R1-0528' - - 'nvidia/DeepSeek-R1-0528-FP4' - - 'nvidia/DeepSeek-R1-0528-FP4-v2' - - 'openai/gpt-oss-120b' - - framework: - description: 'Framework' - 
required: true - type: choice - options: - - 'vllm' - - 'sglang' - - 'trt' - - precision: - description: 'Precision' - required: true - type: choice - options: - - 'fp8' - - 'fp4' - - exp-name: - description: 'Experiment Name' - required: true - type: choice - options: - - 'dsr1_test' - - 'gptoss_test' - - -env: - HF_TOKEN: ${{ secrets.HF_TOKEN }} - HF_HUB_CACHE: '/mnt/hf_hub_cache/' - -jobs: - bmk_h100: - if: ${{ inputs.runner == 'h100' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h100-cr_0' - - 'h100-cr_1' - - 'h100-cw_0' - - 'h100-cw_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_h200: - if: ${{ inputs.runner == 'h200' || inputs.runner == 'h200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'h200-cw_0' - - 'h200-cw_1' - - 'h200-nb_0' - - 'h200-nb_1' - - 'h200-nb_2' - - 'h200-nb_3' - - 'h200-nv_0' - - 'h200-nv_1' - - 'h200-nv_2' - - 'h200-nv_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[4]' - conc-list: '[64]' - - bmk_b200: - if: ${{ inputs.runner == 'b200' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - 'b200-nvd_0' - - 'b200-nvd_1' - - 'b200-tg_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner 
}} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_b200-trt: - if: ${{ inputs.runner == 'b200-trt' }} - strategy: - fail-fast: false - matrix: - runner: - - 'b200-nv_0' - - 'b200-nv_1' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi300x: - if: ${{ inputs.runner == 'mi300x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi300x-amd_0' - - 'mi300x-amd_1' - - 'mi300x-amd_2' - - 'mi300x-amd_3' - - 'mi300x-amd_4' - - 'mi300x-cr_0' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi325x: - if: ${{ inputs.runner == 'mi325x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi325x-amd_0' - - 'mi325x-tw_0' - - 'mi325x-tw_1' - - 'mi325x-tw_2' - - 'mi325x-tw_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - 
max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_mi355x: - if: ${{ inputs.runner == 'mi355x' }} - strategy: - fail-fast: false - matrix: - runner: - - 'mi355x-amd_0' - - 'mi355x-amd_1' - - 'mi355x-amd_2' - - 'mi355x-amd_3' - - name: '${{ matrix.runner }}' - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: ${{ matrix.runner }} - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - tp-list: '[8]' - conc-list: '[1]' - - bmk_gb200: - if: ${{ inputs.runner == 'gb200' && inputs.framework == 'trt' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3' - model: 'deepseek-r1-fp4' - framework: 'dynamo-trtllm' - precision: 'fp4' - exp-name: ${{ inputs.exp-name }} - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - bmk_gb200-sgl: - if: ${{ inputs.runner == 'gb200' && inputs.framework == 'sglang' }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - secrets: inherit - with: - runner: gb200 - image: 'nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'dynamo-sglang' - precision: 'fp8' - exp-name: ${{ inputs.exp-name }} - isl: 8192 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - mtp-mode: 'off' - - collect-test-results: - needs: [ bmk_h100, bmk_h200, bmk_b200, bmk_b200-trt, bmk_mi300x, bmk_mi325x, bmk_mi355x, bmk_gb200, bmk_gb200-sgl ] - if: ${{ always() && !cancelled() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: ${{ inputs.exp-name }} diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 
0d92952da..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,147 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - pull_request: - workflow_dispatch: - inputs: - name: - description: "Name of benchmark from master configs" - required: true - type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true - - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - -jobs: - get-jobs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-jobs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || 
'', inputs.run_8k1k && ' 8k1k' || '') || '' }}) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - test-sweep: - needs: get-jobs - uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ inputs.name }} - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} - - calc-success-rate: - needs: test-sweep - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From 2d1e45763befe5b095ac197fdbabf6d8aab82a2c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 09:20:47 -0500 Subject: [PATCH 079/149] cleaning up after rebase --- .github/workflows/1k8k-sweep.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 581ec07cf..25fc3a362 100644 --- 
a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -4,11 +4,6 @@ concurrency: group: benchmark-lock-1k8k cancel-in-progress: false -on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' - on: # pull_request: workflow_dispatch: From 534d98c2ba3e535a6341b824272313751a07699c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:38:01 -0500 Subject: [PATCH 080/149] adding docs for configs; adding field to configs --- .github/configs/CONFIGS.md | 52 +++++++++++++++++++ .github/configs/amd-master.yaml | 15 ++++-- .github/configs/nvidia-master.yaml | 17 ++++-- utils/matrix-logic/generate_sweep_configs.py | 19 ++++--- .../test_generate_sweep_configs.py | 24 +++++++++ 5 files changed, 110 insertions(+), 17 deletions(-) create mode 100644 .github/configs/CONFIGS.md diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md new file mode 100644 index 000000000..218e17821 --- /dev/null +++ b/.github/configs/CONFIGS.md @@ -0,0 +1,52 @@ +# Configs + +The config files in this directory are meant to be a "source of truth" for what benchmark configurations can/should be run. As such, they must follow a precise format which is described below. + +## Master Configs (AMD, NVIDIA, etc.) + +```yaml +entry-name: + image: string + model: string + model-prefix: string + runner: string + precision: string + framework: string + seq-len-configs: + - isl: int + osl: int + search-space: + - { tp: int, conc-start: int, conc-end: int } + # Optionally, specify 'ep' (expert-parallelism) and 'dp-attn' (data parallel attention) + - { tp: int, ep: int, dp-attn: bool, conc-start: int, conc-end: int } + - ... + - ... +``` +Note: while not required, `entry-name` typically takes the format `---`. 
+ +The below list describes what each field is: + +- `image`: The image used to serve the benchmark, e.g., `vllm/vllm-openai:v0.10.2` +- `model`: The model to server, e.g., `openai/gpt-oss-120b` +- `model-prefix`: The canonical InferenceMAX model prefix reference, i.e., `dsr1` for Deepseek, `gptoss` for gptoss-120b, etc. This value is used to decipher which script in `benchmarks/` should be used in order to launch the benchmark. +- `runner`: This is the runner on which to run the benchmark. This must be a valid runner (key or value) from `runners.yaml`. +- `precision`: The precision to run the benchmark. Again, this is used to find which script to run in `benchmarks/`. +- `framework`: The framework (serving runtime) to serve the benchmark, e.g., `vllm`, `sglang`, `trt`. +- `seq-len-configs`: A list of possible sequence lengths to benchmark. Each entry must have the following fields: + - `isl`: An integer representing the input sequence length, e.g., `1024` + - `osl`: An integer representing the output sequence length, e.g., `8192` + - `search-space`: A list of configurations to run with respective `isl` and `osl`, each entry must be a dict with the following fields: + - `tp`: An integer representing the tensor parallelism level that the configuration will be served at. + - `conc-start`: An integer representing the starting level of concurrency e.g., `4` + - `conc-end`: An integer representing the ending level of concurrency (inclusive) e.g., `128` + - Note: the step factor between `conc-start` and `conc-end` is 2, so if `conc-start` is 4 and `conc-end` is 128, all concurrencies `4, 8, 16, 32, ..., 128` will be run. + - (Optional) `ep`: An integer representing the expert parallelism level that the configuration will be served at. Default is 1 (no expert parallelism) when not specified. + - (Optional) `dp-attn`: A boolean representing whether or not to activate data parallel attention for the configuration. Default is false when not specified. 
+ +Notes: +- No extra fields besides the ones listed may be specified, or else the benchmarks will fail to run. +- Setting the fields above, particularly `ep` and `dp-attn`, only guarantee that the respective values will be passed as environment variables to the benchmark scripts! Actually using those environment variables is an implementation detail at the level of the benchmark Bash script. + +## Runners + +The `runners.yaml` config represents the available runners in the repository. The keys are the runner *types* (i.e., the GPUs as well as some specific combinations like `h200-trt`) whereas the value is a list of *runner nodes*. This config is used to verify the master configs. diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 55086d443..d9558f284 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1,6 +1,7 @@ -dsr1-fp4-mi355x-sgl: +dsr1-fp4-mi355x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 model: amd/DeepSeek-R1-0528-MXFP4-Preview + model-prefix: dsr1 runner: mi355x precision: fp4 framework: sglang @@ -19,9 +20,10 @@ dsr1-fp4-mi355x-sgl: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-mi300x-sgl: +dsr1-fp8-mi300x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: mi300x precision: fp8 framework: sglang @@ -39,9 +41,10 @@ dsr1-fp8-mi300x-sgl: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-mi325x-sgl: +dsr1-fp8-mi325x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: mi325x precision: fp8 framework: sglang @@ -59,9 +62,10 @@ dsr1-fp8-mi325x-sgl: search-space: - { tp: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-mi355x-sgl: +dsr1-fp8-mi355x-sglang: image: rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915 model: 
deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: mi355x precision: fp8 framework: sglang @@ -82,6 +86,7 @@ dsr1-fp8-mi355x-sgl: gptoss-fp4-mi300x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: mi300x precision: fp4 framework: vllm @@ -111,6 +116,7 @@ gptoss-fp4-mi300x-vllm: gptoss-fp4-mi325x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: mi325x precision: fp4 framework: vllm @@ -140,6 +146,7 @@ gptoss-fp4-mi325x-vllm: gptoss-fp4-mi355x-vllm: image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: mi355x precision: fp4 framework: vllm diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9da1cd0f9..92dfb5bbd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,6 +1,7 @@ -dsr1-fp4-b200-sgl: +dsr1-fp4-b200-sglang: image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 runner: b200 precision: fp4 framework: sglang @@ -24,6 +25,7 @@ dsr1-fp4-b200-sgl: dsr1-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 runner: b200-trt precision: fp4 framework: trt @@ -70,9 +72,10 @@ dsr1-fp4-b200-trt: - { tp: 8, conc-start: 4, conc-end: 32 } - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } -dsr1-fp8-b200-sgl: +dsr1-fp8-b200-sglang: image: lmsysorg/sglang:v0.5.3rc1-cu129-b200 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: b200 precision: fp8 framework: sglang @@ -93,6 +96,7 @@ dsr1-fp8-b200-sgl: dsr1-fp8-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: b200-trt precision: fp8 framework: trt @@ -115,9 +119,10 @@ 
dsr1-fp8-b200-trt: # If CONC > 64, then DP_ATTN=true - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-h200-sgl: +dsr1-fp8-h200-sglang: image: lmsysorg/sglang:v0.5.2rc2-cu126 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: h200 precision: fp8 framework: sglang @@ -138,6 +143,7 @@ dsr1-fp8-h200-sgl: dsr1-fp8-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 runner: h200-trt precision: fp8 framework: trt @@ -163,6 +169,7 @@ dsr1-fp8-h200-trt: gptoss-fp4-b200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 model: openai/gpt-oss-120b + model-prefix: gptoss runner: b200-nvs precision: fp4 framework: trt @@ -193,6 +200,7 @@ gptoss-fp4-b200-trt: gptoss-fp4-b200-vllm: image: vllm/vllm-openai:v0.10.2 model: openai/gpt-oss-120b + model-prefix: gptoss runner: b200 precision: fp4 framework: vllm @@ -222,6 +230,7 @@ gptoss-fp4-b200-vllm: gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.10.2 model: openai/gpt-oss-120b + model-prefix: gptoss runner: h100 precision: fp4 framework: vllm @@ -248,6 +257,7 @@ gptoss-fp4-h100-vllm: gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev model: openai/gpt-oss-120b + model-prefix: gptoss runner: h200-trt precision: fp4 framework: trt @@ -277,6 +287,7 @@ gptoss-fp4-h200-trt: gptoss-fp4-h200-vllm: image: vllm/vllm-openai:v0.10.2 model: openai/gpt-oss-120b + model-prefix: gptoss runner: h200 precision: fp4 framework: vllm diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 7574579af..a768554e1 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -67,6 +67,7 @@ def validate_master_configs_structure(all_config_data): required_fields = { 'image': str, 'model': str, + 'model-prefix': str, 'precision': str, 'framework': str, 'runner': str, @@ -202,7 +203,7 @@ def generate_full_sweep(args, 
all_config_data): precision = val['precision'] framework = val['framework'] runner = val['runner'] - model_code = key.split('-')[0] + model_code = val['model-prefix'] for seq_config in seq_len_configs: isl = seq_config['isl'] @@ -315,15 +316,15 @@ def generate_test_config(args, all_config_data): raise ValueError( f"Runner config file '{args.runner_config}' does not exist.") - # Extract model code from config key - model_code = args.key.split('-')[0] - val = all_config_data.get(args.key) if not val: raise ValueError( f"Specified key '{args.key}' does not exist in config files.") + # Extract model code from config + model_code = val['model-prefix'] + runner_nodes = runner_config.get(val['runner'], []) if args.runner_node not in runner_nodes: raise ValueError( @@ -447,9 +448,8 @@ def generate_runner_model_sweep_config(args, all_config_data): if val['runner'] != args.runner_type: continue - # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name - # so that it can be bubbled down to bash script benchmarks... this is probably a FIXME - model_code = key.split('-')[0] + # Get model code for exp_name + model_code = val['model-prefix'] # Find 1k1k config target_config = None @@ -561,9 +561,8 @@ def generate_runner_sweep_config(args, all_config_data): if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): continue - # I.e., for 70b-fp4-... the model_code is 70b which is necessary for exp_name - # so that it can be bubbled down to bash script benchmarks... 
this is probably a FIXME - model_code = key.split('-')[0] + # Get model code for exp_name + model_code = val['model-prefix'] runner_nodes = runner_config.get(val['runner']) if not runner_nodes: diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index 36cb14cd7..cd5ff5b46 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -24,6 +24,7 @@ def sample_master_config(): "70b-fp8-vllm": { "image": "vllm/vllm-openai:v0.10.2", "model": "meta-llama/Llama-3-70b", + "model-prefix": "70b", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -48,6 +49,7 @@ def sample_master_config(): "8b-fp4-trt": { "image": "nvcr.io/nvidia/tritonserver:24.01", "model": "meta-llama/Llama-3-8b", + "model-prefix": "8b", "precision": "fp4", "framework": "trt", "runner": "h100", @@ -64,6 +66,7 @@ def sample_master_config(): "gptoss-120b-fp8-vllm": { "image": "vllm/vllm-openai:latest", "model": "openai/gpt-oss-120b", + "model-prefix": "gptoss", "precision": "fp8", "framework": "vllm", "runner": "h200-trt", @@ -112,6 +115,7 @@ def invalid_master_config(): "missing-field": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", # Missing precision, framework, runner, seq-len-configs } } @@ -294,6 +298,7 @@ def test_validate_master_configs_structure_missing_field(): config = { "test-key": { "image": "test:latest", + "model-prefix": "test", # Missing other required fields } } @@ -307,6 +312,7 @@ def test_validate_master_configs_structure_wrong_type(): "test-key": { "image": 123, # Should be string "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -323,6 +329,7 @@ def test_validate_master_configs_structure_empty_seq_len_configs(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -339,6 
+346,7 @@ def test_validate_master_configs_structure_invalid_search_space(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -363,6 +371,7 @@ def test_validate_master_configs_structure_missing_search_space(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -385,6 +394,7 @@ def test_validate_master_configs_structure_search_space_not_list(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -407,6 +417,7 @@ def test_validate_master_configs_structure_extra_fields_in_search_space(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -436,6 +447,7 @@ def test_validate_master_configs_structure_missing_isl(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -457,6 +469,7 @@ def test_validate_master_configs_structure_wrong_isl_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -479,6 +492,7 @@ def test_validate_master_configs_structure_missing_osl(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -500,6 +514,7 @@ def test_validate_master_configs_structure_wrong_osl_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -522,6 +537,7 @@ def test_validate_master_configs_structure_wrong_tp_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", 
"precision": "fp8", "framework": "vllm", "runner": "h200", @@ -544,6 +560,7 @@ def test_validate_master_configs_structure_wrong_conc_start_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -566,6 +583,7 @@ def test_validate_master_configs_structure_wrong_conc_end_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -588,6 +606,7 @@ def test_validate_master_configs_structure_wrong_ep_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -610,6 +629,7 @@ def test_validate_master_configs_structure_wrong_dp_attn_type(): "test-key": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -784,6 +804,7 @@ def test_generate_full_sweep_seq_len_not_in_config(temp_config_files): "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -822,6 +843,7 @@ def test_generate_full_sweep_concurrency_overshoot(temp_config_files): "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -1010,6 +1032,7 @@ def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_file "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", @@ -1146,6 +1169,7 @@ def test_generate_test_config_concurrency_overshoot(temp_config_files): "test-fp8-vllm": { "image": "test:latest", "model": "test/model", + "model-prefix": "test", "precision": "fp8", "framework": "vllm", "runner": "h200", From 
e21692059db837348c7dafbbfbccf42a5f420a20 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:39:05 -0500 Subject: [PATCH 081/149] hash on dpa too --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 4fb327381..5a3ebfae4 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -67,7 +67,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} conc${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa${{ inputs.dp-attn }} conc${{ inputs.conc }}' steps: - name: Resource cleanup run: | From 751d092031738e7f407862e0c092ff196f60eec8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:41:01 -0500 Subject: [PATCH 082/149] debug --- .github/workflows/e2e-tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index ff7ecb92b..5cdc94a5e 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -5,6 +5,7 @@ name: End-to-End Tests # cancel-in-progress: false on: + pull_request: workflow_dispatch: inputs: generate-cli-command: From d4f57a787a85ac83ebf63ed17a8ebf8e402cd66d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:44:22 -0500 Subject: [PATCH 083/149] debug --- .github/workflows/e2e-tests.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 5cdc94a5e..411c19a6f 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -10,8 +10,9 @@ on: inputs: generate-cli-command: description: "Command passed to generate matrix 
script" - required: true + required: false type: string + default: "filtered-sweep --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type h200 h200-trt" jobs: get-jobs: From 825aa7e5faf4cf07a6629d836e870592e0017a66 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:46:06 -0500 Subject: [PATCH 084/149] debug --- .github/workflows/e2e-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 411c19a6f..404447b54 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -6,6 +6,9 @@ name: End-to-End Tests on: pull_request: + push: + branches: + - initial-refactor workflow_dispatch: inputs: generate-cli-command: From 232b33bb8a53fc2815f16760e17f8caa879a3a39 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:46:58 -0500 Subject: [PATCH 085/149] debug --- .github/workflows/e2e-tests.yml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 404447b54..ff7ecb92b 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -5,17 +5,12 @@ name: End-to-End Tests # cancel-in-progress: false on: - pull_request: - push: - branches: - - initial-refactor workflow_dispatch: inputs: generate-cli-command: description: "Command passed to generate matrix script" - required: false + required: true type: string - default: "filtered-sweep --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type h200 h200-trt" jobs: get-jobs: From d2d025ecfdab16c31f21c5ab41fe67cc996448b8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:50:15 -0500 Subject: [PATCH 086/149] update 
hashing --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 5a3ebfae4..571b39888 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -67,7 +67,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 180 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ inputs.tp }} ep${{ inputs.ep }} dpa${{ inputs.dp-attn }} conc${{ inputs.conc }}' + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp=${{ inputs.tp }} ep=${{ inputs.ep }} dpa=${{ inputs.dp-attn }} conc=${{ inputs.conc }}' steps: - name: Resource cleanup run: | From e95af112be3c0dfd723b393eb0f193076726072a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 11:53:49 -0500 Subject: [PATCH 087/149] deleting extraneous file --- utils/matrix-logic/get_test_sweep_configs.py | 151 ------------------- 1 file changed, 151 deletions(-) delete mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py deleted file mode 100644 index 87ab0457b..000000000 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ /dev/null @@ -1,151 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from a specific configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length 
configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. 
" - f"Available keys: {available_keys}" - ) - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - 
return matrix_values - -if __name__ == "__main__": - main() \ No newline at end of file From bed5406a4e3b90fea8fcbf6450d41a593514012d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:39:01 -0500 Subject: [PATCH 088/149] adding gb200 --- .github/workflows/1k1k-sweep.yml | 47 ++++++++++++++++++- .../workflows/benchmark-multinode-tmpl.yml | 3 +- 2 files changed, 48 insertions(+), 2 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index cbdc490e2..3c1cd01ae 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -87,8 +87,53 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. + benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gptoss 1k1k + strategy: + fail-fast: false + matrix: + config: + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.runner }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k1k + isl: 1024 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-results: - needs: benchmark-dsr1 + needs: [benchmark-dsr1, benchmark-gb200] if: 
${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 07f5b876d..0386e7d55 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -31,8 +31,9 @@ on: required: true type: string random-range-ratio: - required: true + required: false type: string + default: '0.8' mtp-mode: required: true type: string From 475559a4d79dcb79affc51e79e80bdac1b9fafd1 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:40:40 -0500 Subject: [PATCH 089/149] adding gb200 pt 2 --- .github/workflows/1k1k-sweep.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 3c1cd01ae..de26a1af0 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -90,7 +90,7 @@ jobs: # This is a workaround until we can integrate GB200 into master configs. 
benchmark-gb200: uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gptoss 1k1k + name: gb200 1k1k sweep strategy: fail-fast: false matrix: From f24799bb8f2723dc7b1b92b3668e6feeb93be402 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:47:04 -0500 Subject: [PATCH 090/149] adding gb200 pt 3 --- .github/workflows/1k1k-sweep.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index de26a1af0..699b0baff 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -122,7 +122,7 @@ jobs: secrets: inherit with: runner: gb200 - image: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} From 5f61cd3c75875357c056c5716340ff688cbc3662 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 12:50:56 -0500 Subject: [PATCH 091/149] adding gb200 to other isl osl sweeps --- .github/workflows/1k8k-sweep.yml | 45 ++++++++++++++++++++++++++++++++ .github/workflows/8k1k-sweep.yml | 45 ++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 25fc3a362..78d9b939b 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -85,6 +85,51 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. 
+ benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k8k + isl: 1024 + osl: 8192 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index c8338d533..179b542ae 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -85,6 +85,51 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. 
+ benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_8k1k + isl: 8192 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-results: needs: benchmark-dsr1 if: ${{ always() }} From 89ebc6e099c213d67013816b00a868b46ff2cb93 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:09:18 -0500 Subject: [PATCH 092/149] adding gb200 to other isl osl sweeps --- .github/workflows/gb200-tests.yml | 97 +++++++++++++++++++++++++++++++ 1 file changed, 97 insertions(+) create mode 100644 .github/workflows/gb200-tests.yml diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml new file mode 100644 index 000000000..8cc4d775f --- /dev/null +++ b/.github/workflows/gb200-tests.yml @@ -0,0 +1,97 @@ +name: End-to-End Tests + +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false + +on: + push: + branches: + - initial-refactor + workflow_dispatch: + inputs: + image: + description: "Docker Image" + required: true + type: choice + 
options: + - "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1" + - "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3" + + model: + description: "Model" + required: true + type: choice + options: + - "deepseek-ai/DeepSeek-R1-0528" + - "deepseek-r1-fp4" + + precision: + description: "Precision" + required: true + type: choice + options: + - "fp4" + - "fp8" + + framework: + description: "Framework" + required: true + type: choice + options: + - "dynamo-trtllm" + - "dynamo-sglang" + + mtp: + description: "Mtp On/Off" + required: true + type: choice + options: + - "on" + - "off" + + isl: + description: "ISL" + required: true + type: string + + osl: + description: "OSL" + required: true + type: string + +jobs: + pre-run: + runs-on: ubuntu-latest + outputs: + max-model-len: ${{ steps.calc.outputs.max-model-len }} + steps: + - id: calc + shell: python + run: | + import os + import sys + try: + isl = int("${{ inputs.isl }}") + osl = int("${{ inputs.osl }}") + except ValueError: + print("Error: ISL and OSL must be integers") + sys.exit(1) + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f"max-model-len={isl + osl}\n") + + benchmark-gb200: + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 test + secrets: inherit + with: + runner: gb200 + image: ${{ inputs.image }} + model: ${{ inputs.model }} + framework: ${{ inputs.framework }} + precision: ${{ inputs.precision }} + exp-name: dsr1_1k1k + isl: ${{ inputs.isl }} + osl: ${{ inputs.osl }} + max-model-len: ${{ needs.pre-run.outputs.max-model-len }} + mtp-mode: ${{ inputs.mtp }} From 04b614aafc88eca5b20ea2f6e8132210f65d7981 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:10:00 -0500 Subject: [PATCH 093/149] adding gb200 test --- .github/workflows/gb200-tests.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index 8cc4d775f..ff6936a56 100644 --- 
a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -5,9 +5,6 @@ name: End-to-End Tests # cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: image: @@ -81,6 +78,7 @@ jobs: f.write(f"max-model-len={isl + osl}\n") benchmark-gb200: + needs: pre-run uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: gb200 test secrets: inherit From ab052fdbf93c60de1405bae0076759820a47bb94 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:10:44 -0500 Subject: [PATCH 094/149] adding gb200 test --- .github/workflows/gb200-tests.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index ff6936a56..9de931457 100644 --- a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -1,10 +1,13 @@ -name: End-to-End Tests +name: GB200 Tests # concurrency: # group: benchmark-lock # cancel-in-progress: false on: + push: + branches: + - initial-refactor workflow_dispatch: inputs: image: From 6495caa484370dc9a23ca0196eb39f55cf7cbd9a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 15:10:52 -0500 Subject: [PATCH 095/149] adding gb200 test --- .github/workflows/gb200-tests.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index 9de931457..5fc7b6636 100644 --- a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -5,9 +5,6 @@ name: GB200 Tests # cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: image: From 589382d7eaf8b595a90351c15818fdb2b4c16302 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 17:15:29 -0500 Subject: [PATCH 096/149] adding full sweep test --- .github/workflows/full-sweep-test.yml | 371 +++++++++++++++++++ .github/workflows/gb200-tests.yml | 6 +- 
utils/matrix-logic/generate_sweep_configs.py | 13 + 3 files changed, 385 insertions(+), 5 deletions(-) create mode 100644 .github/workflows/full-sweep-test.yml diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml new file mode 100644 index 000000000..d5b1894b8 --- /dev/null +++ b/.github/workflows/full-sweep-test.yml @@ -0,0 +1,371 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + push: + branches: + - initial-refactor + workflow_dispatch: + inputs: + run_1k1k: + type: boolean + required: false + run_8k1k: + type: boolean + required: false + run_1k8k: + type: boolean + required: false + + use_h100: + type: boolean + required: false + use_h200: + type: boolean + required: false + use_b200: + type: boolean + required: false + use_mi300x: + type: boolean + required: false + use_mi325x: + type: boolean + required: false + use_mi355x: + type: boolean + required: false + use_gb200: + type: boolean + required: false + +jobs: + get-configs: + runs-on: ubuntu-latest + outputs: + dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} + dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} + dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} + gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} + gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} + gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: generate-configs + run: | + pip install pydantic + + # Build runner type filter based on inputs + RUNNER_TYPES="" + if [ "${{ inputs.use_h100 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES h100" + fi + if [ "${{ inputs.use_h200 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES h200 h200-trt" + fi + if [ "${{ inputs.use_b200 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES b200 b200-trt b200-nvs" + fi + if [ "${{ inputs.use_mi300x }}" = "true" ]; then + 
RUNNER_TYPES="$RUNNER_TYPES mi300x" + fi + if [ "${{ inputs.use_mi325x }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES mi325x" + fi + if [ "${{ inputs.use_mi355x }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES mi355x" + fi + + # Build runner filter argument if runner types specified + RUNNER_FILTER="" + if [ -n "$RUNNER_TYPES" ]; then + RUNNER_FILTER="--runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml" + fi + + # Generate dsr1 configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 $RUNNER_FILTER) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 $RUNNER_FILTER) + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 $RUNNER_FILTER) + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # Generate gptoss configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss $RUNNER_FILTER) + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss $RUNNER_FILTER) + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss $RUNNER_FILTER) + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # DSR1 1K1K Benchmarks + benchmark-dsr1-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-1k1k-results: + needs: benchmark-dsr1-1k1k + if: ${{ 
always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" + + # GPTOSS 1K1K Benchmarks + benchmark-gptoss-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k1k-results: + needs: benchmark-gptoss-1k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" + + # DSR1 8K1K Benchmarks + benchmark-dsr1-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + 
collect-dsr1-8k1k-results: + needs: benchmark-dsr1-8k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + + # GPTOSS 8K1K Benchmarks + benchmark-gptoss-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-8k1k-results: + needs: benchmark-gptoss-8k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" + + # DSR1 1K8K Benchmarks + benchmark-dsr1-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: 
${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-1k8k-results: + needs: benchmark-dsr1-1k8k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + + # GPTOSS 1K8K Benchmarks + benchmark-gptoss-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k8k-results: + needs: benchmark-gptoss-1k8k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k8k" + + calc-success-rate: + needs: + [ + collect-dsr1-1k1k-results, + collect-dsr1-1k8k-results, + collect-dsr1-8k1k-results, + collect-gptoss-1k1k-results, + collect-gptoss-1k8k-results, + collect-gptoss-8k1k-results, + ] + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install 
PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml index 5fc7b6636..c700599d9 100644 --- a/.github/workflows/gb200-tests.yml +++ b/.github/workflows/gb200-tests.yml @@ -1,14 +1,10 @@ name: GB200 Tests -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false - on: workflow_dispatch: inputs: image: - description: "Docker Image" + description: "Serving Image" required: true type: choice options: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index a768554e1..c43a1759e 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -551,11 +551,19 @@ def generate_runner_sweep_config(args, all_config_data): raise ValueError( f"Runner config file '{args.runner_config}' does not exist.") + if not runner_config.get(args.runner_type): + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + + matrix_values = [] for key, val in all_config_data.items(): # Only consider configs with specified runner if not key.startswith(args.model_prefix): continue + + if not val['runner'] == args.runner_type: + continue # Optionally filter by precision and framework if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): @@ -807,6 +815,11 @@ def main(): add_help=False, help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate all runner nodes that should run a particular model can. 
For instance, this should be used to validate that all runners nodes that should run gptoss-120b actually do so successfully.' ) + test_config_parser.add_argument( + '--runner-type', + required=True, + help='Runner type (e.g., h200-trt, h100)' + ) test_config_parser.add_argument( '--model-prefix', required=True, From b920ec4d48b201447295a64c3a61e71c86397072 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 17:15:53 -0500 Subject: [PATCH 097/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index d5b1894b8..f7fdf9fc2 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -5,9 +5,6 @@ concurrency: cancel-in-progress: false on: - push: - branches: - - initial-refactor workflow_dispatch: inputs: run_1k1k: From d4c5dbc64871915c2ab0708c9001d208462b4f38 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:06:53 -0500 Subject: [PATCH 098/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 49 +++++++++++++++++++-------- 1 file changed, 35 insertions(+), 14 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index f7fdf9fc2..eb1abbd25 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -1,8 +1,8 @@ name: Test - Full Sweep -concurrency: - group: benchmark-lock - cancel-in-progress: false +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false on: workflow_dispatch: @@ -77,30 +77,39 @@ jobs: if [ "${{ inputs.use_mi355x }}" = "true" ]; then RUNNER_TYPES="$RUNNER_TYPES mi355x" fi - - # Build runner filter argument if runner types specified - RUNNER_FILTER="" - if [ -n "$RUNNER_TYPES" ]; then - RUNNER_FILTER="--runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml" + if [ 
"${{ inputs.use_gb200 }}" = "true" ]; then + RUNNER_TYPES="$RUNNER_TYPES gb200" fi # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + fi echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + fi echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT else echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + fi echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT else echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT @@ -108,21 +117,33 @@ jobs: # Generate gptoss configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type 
$RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + fi echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT else echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_1k8k }}" = "true" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + fi echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT else echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT fi if [ "${{ inputs.run_8k1k }}" = "true" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss $RUNNER_FILTER) + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep 
--config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + fi echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT else echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT From 02deb3dae328104abe34dbce90ef321aabc29c8a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:11:19 -0500 Subject: [PATCH 099/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index eb1abbd25..bb5246a71 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -60,27 +60,30 @@ jobs: # Build runner type filter based on inputs RUNNER_TYPES="" if [ "${{ inputs.use_h100 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES h100" + RUNNER_TYPES="${RUNNER_TYPES} h100" fi if [ "${{ inputs.use_h200 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES h200 h200-trt" + RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" fi if [ "${{ inputs.use_b200 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES b200 b200-trt b200-nvs" + RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" fi if [ "${{ inputs.use_mi300x }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES mi300x" + RUNNER_TYPES="${RUNNER_TYPES} mi300x" fi if [ "${{ inputs.use_mi325x }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES mi325x" + RUNNER_TYPES="${RUNNER_TYPES} mi325x" fi if [ "${{ inputs.use_mi355x }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES mi355x" + 
RUNNER_TYPES="${RUNNER_TYPES} mi355x" fi if [ "${{ inputs.use_gb200 }}" = "true" ]; then - RUNNER_TYPES="$RUNNER_TYPES gb200" + RUNNER_TYPES="${RUNNER_TYPES} gb200" fi + # Trim leading whitespace + RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then if [ -n "$RUNNER_TYPES" ]; then From 18c26b3a0af0dc6f76561ec594720449dfe5a271 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:11:58 -0500 Subject: [PATCH 100/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index bb5246a71..a89f61421 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,6 +56,7 @@ jobs: - id: generate-configs run: | pip install pydantic + set -x # Build runner type filter based on inputs RUNNER_TYPES="" From f1477e53b1e1301f070c1f2aa5ddd4d57c732cc9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:16:09 -0500 Subject: [PATCH 101/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index a89f61421..e81a71f42 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -85,10 +85,13 @@ jobs: # Trim leading whitespace RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + # DSR1 does not run on h100, so filter it out for dsr1 configs + DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) + # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) else DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) fi @@ -98,8 +101,8 @@ jobs: fi if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) else DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) fi @@ -109,8 +112,8 @@ jobs: fi if [ "${{ 
inputs.run_8k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) else DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) fi From d64d907e621cf3c4097ab1c73a2c73c8ecb35c62 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:16:27 -0500 Subject: [PATCH 102/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index e81a71f42..8edbf8712 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,7 +56,6 @@ jobs: - id: generate-configs run: | pip install pydantic - set -x # Build runner type filter based on inputs RUNNER_TYPES="" From d6bf37e3e5ded92de33c67b02b65dcf1c338351a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:24:16 -0500 Subject: [PATCH 103/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 8edbf8712..73a230beb 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -16,7 +16,6 @@ on: run_1k8k: type: boolean required: false - use_h100: type: boolean required: false @@ -56,7 +55,8 @@ jobs: - id: generate-configs run: | pip install pydantic - + + set -x # Build runner type filter based on inputs RUNNER_TYPES="" if [ "${{ inputs.use_h100 }}" = "true" ]; then From dba3b4cd04704f0a28304b6fb6c666e8f0e8fb8a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:35:17 -0500 Subject: [PATCH 104/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 126 ++++++++++++++------------ 1 file changed, 69 insertions(+), 57 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 73a230beb..9abfe4118 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -55,10 +55,10 @@ jobs: - id: generate-configs run: | pip install pydantic - - set -x - # Build runner type filter based on inputs + + # Build runner type filters based on inputs RUNNER_TYPES="" + if [ "${{ inputs.use_h100 }}" = "true" ]; then RUNNER_TYPES="${RUNNER_TYPES} h100" fi @@ -84,76 +84,88 @@ jobs: # Trim leading whitespace RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) - # DSR1 does not run on h100, so filter it out for dsr1 configs + # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) # Generate dsr1 configs if [ "${{ inputs.run_1k1k }}" = "true" ]; then if [ -n "$DSR1_RUNNER_TYPES" ]; then DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) - fi - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi + # Generate dsr1 configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) + fi + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT + fi - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) - fi - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + fi + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi - # Generate gptoss configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) - fi - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + if [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) + fi + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K8K=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) - fi - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi + # Generate gptoss configs + if [ "${{ inputs.run_1k1k }}" = "true" ]; then + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + fi + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) - fi - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> 
$GITHUB_OUTPUT - fi + if [ "${{ inputs.run_1k8k }}" = "true" ]; then + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + fi + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ]; then + if [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + else + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) + fi + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT + fi # DSR1 1K1K Benchmarks benchmark-dsr1-1k1k: From a45e4bf7df1e1e4a9bec45dc458b903b7a2e5834 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:36:12 -0500 Subject: [PATCH 105/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 1 + 1 file changed, 1 insertion(+) diff --git 
a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 9abfe4118..759980b44 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,6 +56,7 @@ jobs: run: | pip install pydantic + set -x # Build runner type filters based on inputs RUNNER_TYPES="" From 60233aa37c21deb54c7acf3ca5185fbdf55a920b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:38:49 -0500 Subject: [PATCH 106/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 115 +++++++++----------------- 1 file changed, 39 insertions(+), 76 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 759980b44..4de3595ab 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -56,7 +56,6 @@ jobs: run: | pip install pydantic - set -x # Build runner type filters based on inputs RUNNER_TYPES="" @@ -88,85 +87,49 @@ jobs: # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - # Generate dsr1 configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi + # Generate dsr1 configs (only if we have valid runner types for DSR1) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT else echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT fi - # Generate dsr1 configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) - fi - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) - fi - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n 
"$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) - fi - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs - if [ "${{ inputs.run_1k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) - fi - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) - fi - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ]; then - if [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - else - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) - fi - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi + if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # Generate gptoss configs (only if we have runner types selected) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" 
>> $GITHUB_OUTPUT + fi # DSR1 1K1K Benchmarks benchmark-dsr1-1k1k: From c1b5ddd77ba1ab3f5f07592652eb880a19907b2d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:42:49 -0500 Subject: [PATCH 107/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 4de3595ab..645af1cdb 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -55,7 +55,8 @@ jobs: - id: generate-configs run: | pip install pydantic - + + set -x # Build runner type filters based on inputs RUNNER_TYPES="" From 2cd02954309bbc837085d6d2cd80cbc6963090c5 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 20:51:51 -0500 Subject: [PATCH 108/149] adding full sweep test pt 2 --- .github/workflows/1k8k-sweep.yml | 4 +- .github/workflows/8k1k-sweep.yml | 4 +- .github/workflows/full-sweep-test.yml | 92 ++++++++++++++++++++++++++- 3 files changed, 94 insertions(+), 6 deletions(-) diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 78d9b939b..837033312 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -127,11 +127,11 @@ jobs: exp-name: ${{ matrix.config.model-prefix }}_1k8k isl: 1024 osl: 8192 - max-model-len: 2048 + max-model-len: 9216 mtp-mode: ${{ matrix.config.mtp }} collect-dsr1-results: - needs: benchmark-dsr1 + needs: [benchmark-dsr1, benchmark-gb200] if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 179b542ae..bc7e51e30 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -127,11 +127,11 @@ jobs: exp-name: ${{ matrix.config.model-prefix }}_8k1k isl: 8192 osl: 1024 - max-model-len: 2048 + max-model-len: 9216 mtp-mode: ${{ 
matrix.config.mtp }} collect-dsr1-results: - needs: benchmark-dsr1 + needs: [benchmark-dsr1, benchmark-gb200] if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 645af1cdb..d5340dc60 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -55,7 +55,7 @@ jobs: - id: generate-configs run: | pip install pydantic - + set -x # Build runner type filters based on inputs RUNNER_TYPES="" @@ -289,8 +289,96 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + # This is a workaround until we can integrate GB200 into master configs. + benchmark-gb200-1k1k: + if: ${{ inputs.use_gb200 && inputs.run_1k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: &dsr1_static_configs + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k1k + isl: 1024 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-1k8k: + if: ${{ inputs.use_gb200 && inputs.run_1k8k }} 
+ uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k8k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-8k1k: + if: ${{ inputs.use_gb200 && inputs.run_8k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_8k1k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + collect-dsr1-1k8k-results: - needs: benchmark-dsr1-1k8k + needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit From 3065c13acdb77f92066192741fcaeeaac242ba03 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 21:45:07 -0500 Subject: [PATCH 109/149] reverting title --- .github/workflows/1k1k-sweep.yml | 2 +- .github/workflows/1k8k-sweep.yml | 2 +- .github/workflows/8k1k-sweep.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 699b0baff..bc5305460 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,4 +1,4 @@ -name: "1K/1K Sweep" +name: "Full Sweep Scheduler - 1k1k" concurrency: group: benchmark-lock-1k1k diff 
--git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 837033312..da4d1daf3 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,4 +1,4 @@ -name: "1K/8K Sweep" +name: "Full Sweep Scheduler - 1k8k" concurrency: group: benchmark-lock-1k8k diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index bc7e51e30..fa3249da7 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -1,4 +1,4 @@ -name: "8K/1K Sweep" +name: "Full Sweep Scheduler - 8k1k" concurrency: group: benchmark-lock-8k1k From 89d6dc3d08f2b18cb9c625cdcc671748665c6789 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 22:05:28 -0500 Subject: [PATCH 110/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test-orig.yml | 460 +++++++++++++++++++++ .github/workflows/full-sweep-test.yml | 35 +- 2 files changed, 468 insertions(+), 27 deletions(-) create mode 100644 .github/workflows/full-sweep-test-orig.yml diff --git a/.github/workflows/full-sweep-test-orig.yml b/.github/workflows/full-sweep-test-orig.yml new file mode 100644 index 000000000..d5340dc60 --- /dev/null +++ b/.github/workflows/full-sweep-test-orig.yml @@ -0,0 +1,460 @@ +name: Test - Full Sweep + +# concurrency: +# group: benchmark-lock +# cancel-in-progress: false + +on: + workflow_dispatch: + inputs: + run_1k1k: + type: boolean + required: false + run_8k1k: + type: boolean + required: false + run_1k8k: + type: boolean + required: false + use_h100: + type: boolean + required: false + use_h200: + type: boolean + required: false + use_b200: + type: boolean + required: false + use_mi300x: + type: boolean + required: false + use_mi325x: + type: boolean + required: false + use_mi355x: + type: boolean + required: false + use_gb200: + type: boolean + required: false + +jobs: + get-configs: + runs-on: ubuntu-latest + outputs: + dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} + dsr1-1k8k: ${{ 
steps.generate-configs.outputs.dsr1-1k8k }} + dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} + gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} + gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} + gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: generate-configs + run: | + pip install pydantic + + set -x + # Build runner type filters based on inputs + RUNNER_TYPES="" + + if [ "${{ inputs.use_h100 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} h100" + fi + if [ "${{ inputs.use_h200 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" + fi + if [ "${{ inputs.use_b200 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" + fi + if [ "${{ inputs.use_mi300x }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} mi300x" + fi + if [ "${{ inputs.use_mi325x }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} mi325x" + fi + if [ "${{ inputs.use_mi355x }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} mi355x" + fi + if [ "${{ inputs.use_gb200 }}" = "true" ]; then + RUNNER_TYPES="${RUNNER_TYPES} gb200" + fi + + # Trim leading whitespace + RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + + # DSR1 doesn't support H100, so exclude it + DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) + + # Generate dsr1 configs (only if we have valid runner types for DSR1) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ 
inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT + else + echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then + DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT + else + echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # Generate gptoss configs (only if we have runner types selected) + if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix 
gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT + else + echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT + fi + + if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then + GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT + else + echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT + fi + + # DSR1 1K1K Benchmarks + benchmark-dsr1-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-1k1k-results: + needs: benchmark-dsr1-1k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k1k" + + # GPTOSS 1K1K Benchmarks + benchmark-gptoss-1k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: 
${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k1k-results: + needs: benchmark-gptoss-1k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" + + # DSR1 8K1K Benchmarks + benchmark-dsr1-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-dsr1-8k1k-results: + needs: benchmark-dsr1-8k1k + if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_8k1k" + + # GPTOSS 8K1K Benchmarks + benchmark-gptoss-8k1k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: 
./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-8k1k-results: + needs: benchmark-gptoss-8k1k + if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_8k1k" + + # DSR1 1K8K Benchmarks + benchmark-dsr1-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + # This is a workaround until we can integrate GB200 into master configs. 
+ benchmark-gb200-1k1k: + if: ${{ inputs.use_gb200 && inputs.run_1k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: &dsr1_static_configs + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "off", + } + - { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trtllm", + "mtp": "on", + } + - { + "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "dynamo-sglang", + "mtp": "off", + } + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k1k + isl: 1024 + osl: 1024 + max-model-len: 2048 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-1k8k: + if: ${{ inputs.use_gb200 && inputs.run_1k8k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_1k8k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + + benchmark-gb200-8k1k: + if: ${{ inputs.use_gb200 && inputs.run_8k1k }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: gb200 1k1k sweep + strategy: + fail-fast: false + matrix: + config: *dsr1_static_configs + 
secrets: inherit + with: + runner: gb200 + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.model-prefix }}_8k1k + isl: 1024 + osl: 8192 + max-model-len: 9216 + mtp-mode: ${{ matrix.config.mtp }} + + collect-dsr1-1k8k-results: + needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] + if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "dsr1_1k8k" + + # GPTOSS 1K8K Benchmarks + benchmark-gptoss-1k8k: + needs: get-configs + if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} + secrets: inherit + with: + exp-name: ${{ matrix.config.exp-name }} + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + collect-gptoss-1k8k-results: + needs: benchmark-gptoss-1k8k + if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k8k" + + calc-success-rate: + needs: + [ + collect-dsr1-1k1k-results, + collect-dsr1-1k8k-results, + collect-dsr1-8k1k-results, + collect-gptoss-1k1k-results, + collect-gptoss-1k8k-results, + collect-gptoss-8k1k-results, + ] + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: 
"run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index d5340dc60..0787b5c2a 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -58,32 +58,7 @@ jobs: set -x # Build runner type filters based on inputs - RUNNER_TYPES="" - - if [ "${{ inputs.use_h100 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h100" - fi - if [ "${{ inputs.use_h200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" - fi - if [ "${{ inputs.use_b200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" - fi - if [ "${{ inputs.use_mi300x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi300x" - fi - if [ "${{ inputs.use_mi325x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi325x" - fi - if [ "${{ inputs.use_mi355x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi355x" - fi - if [ "${{ inputs.use_gb200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} gb200" - fi - - # Trim leading whitespace - RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) + RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200 h200-trt' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" # DSR1 doesn't support H100, so exclude it 
DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) @@ -378,7 +353,13 @@ jobs: mtp-mode: ${{ matrix.config.mtp }} collect-dsr1-1k8k-results: - needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] + needs: + [ + benchmark-dsr1-1k8k, + benchmark-gb200-1k1k, + benchmark-gb200-1k8k, + benchmark-gb200-8k1k, + ] if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit From 68e24620519fe9c59caeb46bea3ec7810525bf4d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 22:07:39 -0500 Subject: [PATCH 111/149] adding full sweep test pt 2 --- .github/workflows/full-sweep-test-orig.yml | 460 --------------------- 1 file changed, 460 deletions(-) delete mode 100644 .github/workflows/full-sweep-test-orig.yml diff --git a/.github/workflows/full-sweep-test-orig.yml b/.github/workflows/full-sweep-test-orig.yml deleted file mode 100644 index d5340dc60..000000000 --- a/.github/workflows/full-sweep-test-orig.yml +++ /dev/null @@ -1,460 +0,0 @@ -name: Test - Full Sweep - -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false - -on: - workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - run_1k8k: - type: boolean - required: false - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - get-configs: - runs-on: ubuntu-latest - outputs: - dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} - dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} - dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} - gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} - 
gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} - gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: generate-configs - run: | - pip install pydantic - - set -x - # Build runner type filters based on inputs - RUNNER_TYPES="" - - if [ "${{ inputs.use_h100 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h100" - fi - if [ "${{ inputs.use_h200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} h200 h200-trt" - fi - if [ "${{ inputs.use_b200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} b200 b200-trt b200-nvs" - fi - if [ "${{ inputs.use_mi300x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi300x" - fi - if [ "${{ inputs.use_mi325x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi325x" - fi - if [ "${{ inputs.use_mi355x }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} mi355x" - fi - if [ "${{ inputs.use_gb200 }}" = "true" ]; then - RUNNER_TYPES="${RUNNER_TYPES} gb200" - fi - - # Trim leading whitespace - RUNNER_TYPES=$(echo $RUNNER_TYPES | xargs) - - # DSR1 doesn't support H100, so exclude it - DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - - # Generate dsr1 configs (only if we have valid runner types for DSR1) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep 
--config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs (only if we have runner types selected) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo 
"gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # DSR1 1K1K Benchmarks - benchmark-dsr1-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-1k1k-results: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - # GPTOSS 1K1K Benchmarks - benchmark-gptoss-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ 
matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k1k-results: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" - - # DSR1 8K1K Benchmarks - benchmark-dsr1-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-8k1k-results: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - # GPTOSS 8K1K Benchmarks - benchmark-gptoss-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} - secrets: inherit - with: - exp-name: ${{ 
matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-8k1k-results: - needs: benchmark-gptoss-8k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - # DSR1 1K8K Benchmarks - benchmark-dsr1-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - # This is a workaround until we can integrate GB200 into master configs. 
- benchmark-gb200-1k1k: - if: ${{ inputs.use_gb200 && inputs.run_1k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: &dsr1_static_configs - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k1k - isl: 1024 - osl: 1024 - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-1k8k: - if: ${{ inputs.use_gb200 && inputs.run_1k8k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k8k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-8k1k: - if: ${{ inputs.use_gb200 && inputs.run_8k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - 
secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - collect-dsr1-1k8k-results: - needs: [benchmark-dsr1-1k8k, benchmark-gb200-1k1k, benchmark-gb200-1k8k, benchmark-gb200-8k1k] - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - - # GPTOSS 1K8K Benchmarks - benchmark-gptoss-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k8k-results: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - calc-success-rate: - needs: - [ - collect-dsr1-1k1k-results, - collect-dsr1-1k8k-results, - collect-dsr1-8k1k-results, - collect-gptoss-1k1k-results, - collect-gptoss-1k8k-results, - collect-gptoss-8k1k-results, - ] - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: 
"run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From 04992c4cc79808de725c7a420f25b2edaddb1c82 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 30 Oct 2025 22:10:47 -0500 Subject: [PATCH 112/149] reverting title --- .github/workflows/full-sweep-test.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 0787b5c2a..ed3b13f59 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -58,7 +58,7 @@ jobs: set -x # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200 h200-trt' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" + RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_h200 && 'h200 h200-trt' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" # DSR1 doesn't support H100, so exclude it DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) From f2f1a5ea16326cc1248799b8c01134f30b255701 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 08:55:53 -0500 
Subject: [PATCH 113/149] fixing test files --- utils/matrix-logic/test_generate_sweep_configs.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index cd5ff5b46..15c5f25a3 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -1236,6 +1236,7 @@ def test_generate_runner_sweep_config(sample_master_config, temp_config_files): class Args: model_prefix = "70b" + runner_type = "h200" precision = None framework = None runner_config = runner_file @@ -1250,6 +1251,7 @@ def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_co class Args: model_prefix = "70b" + runner_type = "h200" precision = "fp8" framework = "vllm" runner_config = runner_file @@ -1265,6 +1267,7 @@ def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_conf class Args: model_prefix = "nonexistent" + runner_type = "h200" precision = None framework = None runner_config = runner_file @@ -1393,6 +1396,7 @@ def test_main_runner_sweep(temp_config_files): "runner-sweep", "--config-files", master_file, "--runner-config", runner_file, + "--runner-type", "h200", "--model-prefix", "70b" ] From 9d2cbbba6e75077cda460597a5f39f8d43c4daa0 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 08:57:40 -0500 Subject: [PATCH 114/149] fixing gha syntax error --- .github/workflows/full-sweep-test.yml | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index ed3b13f59..bbbd574c2 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -9,33 +9,43 @@ on: inputs: run_1k1k: type: boolean + description: "When true, run 1k1k" required: false run_8k1k: type: boolean + description: "When true, run 8k1k" required: false run_1k8k: type: boolean + description: "When true, run 
1k8k" required: false use_h100: type: boolean + description: "When true, run H100" required: false use_h200: type: boolean + description: "When true, run H200" required: false use_b200: type: boolean + description: "When true, run B200" required: false use_mi300x: type: boolean + description: "When true, run MI300X" required: false use_mi325x: type: boolean + description: "When true, run MI325X" required: false use_mi355x: type: boolean + description: "When true, run MI355X" required: false use_gb200: type: boolean + description: "When true, run GB200" required: false jobs: From 7164cdef8c5bd21e5b8369f9caea24e530e934b8 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 09:52:43 -0500 Subject: [PATCH 115/149] fixing gha syntax error --- .github/workflows/e2e-tests.yml | 1 + .github/workflows/full-sweep-test.yml | 2 ++ 2 files changed, 3 insertions(+) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index ff7ecb92b..fef12802d 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,4 +1,5 @@ name: End-to-End Tests +run-name: e2e Test - ${{ github.event.inputs.generate-cli-command }} # concurrency: # group: benchmark-lock diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index bbbd574c2..9647dd21d 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -62,6 +62,8 @@ jobs: - name: Checkout code uses: actions/checkout@v4 + # This looks complicated, but it is just calling generate_sweep_configs.py conditioned on + # discrete inputs (i.e., run_1k1k, run_h100, etc.) 
to split the test sweep into discrete jobs - id: generate-configs run: | pip install pydantic From 5eb1f90d4e32c9de5088098f0f84222008f5a5a6 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:24:50 -0500 Subject: [PATCH 116/149] fixing error in multinode script --- .github/workflows/benchmark-multinode-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 0386e7d55..b4d917575 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -94,7 +94,7 @@ jobs: # Extract GPU count from filename for tp_size calculation gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/") if [ -n "$gpus" ]; then - python3 utils/process_result.py ${{ inputs.runner }} $gpus ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE + python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 false ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE fi fi done From 9318ba761044e28b6702f79e020209061c6ba5c9 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:25:16 -0500 Subject: [PATCH 117/149] bug fxes --- .github/workflows/full-sweep-test.yml | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 9647dd21d..0f8771b54 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -9,43 +9,33 @@ on: inputs: run_1k1k: type: boolean - description: "When true, run 1k1k" required: false run_8k1k: type: boolean - description: "When true, run 8k1k" required: false run_1k8k: type: boolean - description: "When true, run 1k8k" required: false use_h100: type: boolean - description: "When true, run H100" required: false use_h200: type: boolean - description: "When true, run H200" required: false use_b200: type: boolean - description: 
"When true, run B200" required: false use_mi300x: type: boolean - description: "When true, run MI300X" required: false use_mi325x: type: boolean - description: "When true, run MI325X" required: false use_mi355x: type: boolean - description: "When true, run MI355X" required: false use_gb200: type: boolean - description: "When true, run GB200" required: false jobs: @@ -325,7 +315,7 @@ jobs: benchmark-gb200-1k8k: if: ${{ inputs.use_gb200 && inputs.run_1k8k }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 1k8k sweep strategy: fail-fast: false matrix: @@ -346,7 +336,7 @@ jobs: benchmark-gb200-8k1k: if: ${{ inputs.use_gb200 && inputs.run_8k1k }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 8k1k sweep strategy: fail-fast: false matrix: From 5a56794b517a3f77240964c472c1d5c581e9ce0b Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:26:30 -0500 Subject: [PATCH 118/149] debug --- .github/workflows/1k1k-sweep.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index bc5305460..20e1f0c2d 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -5,7 +5,9 @@ concurrency: cancel-in-progress: false on: - # pull_request: + push: + branches: + - initial-refactor workflow_dispatch: # schedule: # - cron: '0 23 * * *' From 912d70d3a9bf65ce847f369d2d29c2303cb51df0 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 31 Oct 2025 11:26:36 -0500 Subject: [PATCH 119/149] debug --- .github/workflows/1k1k-sweep.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 20e1f0c2d..c1d3ff72b 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -5,9 +5,6 @@ concurrency: cancel-in-progress: false on: - push: - branches: - - initial-refactor 
workflow_dispatch: # schedule: # - cron: '0 23 * * *' From 98362f1119ee4a1435fdcae8ec5d4b28d5ef666b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 11:57:49 -0500 Subject: [PATCH 120/149] cleaning up the full sweep sched --- .github/workflows/1k1k-sweep.yml | 4 ++-- .github/workflows/1k8k-sweep.yml | 9 ++++----- .github/workflows/8k1k-sweep.yml | 9 ++++----- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index c1d3ff72b..f6ec37562 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -6,8 +6,8 @@ concurrency: on: workflow_dispatch: -# schedule: -# - cron: '0 23 * * *' + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index da4d1daf3..82bc48817 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,14 +1,13 @@ name: "Full Sweep Scheduler - 1k8k" concurrency: - group: benchmark-lock-1k8k - cancel-in-progress: false + group: benchmark-lock-1k8k + cancel-in-progress: false on: - # pull_request: workflow_dispatch: -# schedule: -# - cron: '0 23 * * *' + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index fa3249da7..8863112af 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -1,14 +1,13 @@ name: "Full Sweep Scheduler - 8k1k" concurrency: - group: benchmark-lock-8k1k - cancel-in-progress: false + group: benchmark-lock-8k1k + cancel-in-progress: false on: - # pull_request: workflow_dispatch: -# schedule: -# - cron: '0 23 * * *' + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: From 1eb74b9b820c2872253132a2f00407d89a6af631 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 11:58:18 -0500 Subject: [PATCH 121/149] cleaning up other workflows --- 
.github/workflows/e2e-tests.yml | 4 ---- .github/workflows/full-sweep-test.yml | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index fef12802d..1d13b3a87 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,10 +1,6 @@ name: End-to-End Tests run-name: e2e Test - ${{ github.event.inputs.generate-cli-command }} -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false - on: workflow_dispatch: inputs: diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index 0f8771b54..a2ff06d18 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -1,8 +1,8 @@ name: Test - Full Sweep -# concurrency: -# group: benchmark-lock -# cancel-in-progress: false +concurrency: + group: benchmark-lock + cancel-in-progress: false on: workflow_dispatch: From f78de57cb04361258a721a7efa80462fbd56f2c5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:43:14 -0500 Subject: [PATCH 122/149] docs --- .github/README.md | 116 ++++++ utils/matrix-logic/generate_sweep_configs.py | 407 ++++++++++--------- 2 files changed, 334 insertions(+), 189 deletions(-) create mode 100644 .github/README.md diff --git a/.github/README.md b/.github/README.md new file mode 100644 index 000000000..f4539dd5d --- /dev/null +++ b/.github/README.md @@ -0,0 +1,116 @@ +# How to Test Workflows + +In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. As input, this workflow takes in the CLI arguments for the `utils/matrix-logic/generate_sweep_configs.py` script. The usage for this script is shown below: + +``` +usage: generate_sweep_configs.py [-h] {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} ... 
+ +Generate benchmark configurations from YAML config files + +positional arguments: + {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} + Available commands + full-sweep Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths + test-config Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config. + runner-model-sweep Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate + that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner + nodes. + runner-sweep Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is + meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runner nodes that should run gptoss-120b + actually do so successfully. + custom Enter custom values + +options: + -h, --help show this help message and exit +``` + +Instead of explaining each command at a high level, let's just walk through some common testing scenarios and describe how to run them. + +**Scenario 1**: I want to increase the concurrency from 128 to 256 in the 1k1k scenario for the `dsr1-fp4-b200-sglang` config (from `.github/configs/nvidia-master.yaml`) and then test it. 
+ +Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: +``` +test-config --key dsr1-fp4-b200-sglang --seq-len 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +If we wanted to also test 1k8k or 8k1k scenarios, we would simply append `1k8k` or `8k1k` to `--seq-len`, respectively. + +Further, if we wanted to run that config on *one specific* runner node, we could specify that by appending `--runner-node` to the argument list. Note that if the specified runner node is not compatible with the specified config key (as dictated by `.github/configs/runners.yaml`), then the workflow will error: + +``` +test-config --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --key dsr1-fp4-b200-sglang --seq-len 1k1k --runner-node mi300x-amd_0 + +ValueError: Runner node 'mi300x-amd_0' is not compatible with config 'dsr1-fp4-b200-sglang' which runs on runner type 'b200'. Available runner nodes for this config are 'b200-nb_0, b200-nb_1, b200-nvd_0, b200-nvd_1, b200-nvd_2, b200-nvd_3, b200-tg_0'. +``` + +**Scenario 2**: I just made a change to the `benchmarks/dsr1_fp8_b200_docker.sh` and I need to verify that these changes work across all B200 runners. + +Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: +``` +runner-sweep --runner-type b200 --model-prefix dsr1 --precision fp8 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +This will run a test (just the highest available parallelism and lowest available concurrency) for each B200 runner node for each Deepseek config that runs on B200 with fp8 precision. I.e., this can be used to "sweep" across runners for a particular model to test that all runners still work with changes that have been made. 
+ +**Scenario 3**: I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. + +Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: +``` +runner-model-sweep --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +This will run a test (just the highest available parallelism and lowest available concurrency) for each configuration that specifies the `h200` runner type, across all H200 runner nodes defined in `.github/configs/runners.yaml`. + +For example, if you have configs `dsr1-fp8-h200-sglang`, `dsr1-fp8-h200-trt`, and `gptoss-fp4-h200-vllm` that all use `runner: h200`, and you have 8 H200 nodes (`h200-cw_0`, `h200-cw_1`, etc.), this will run all 3 configs on all 8 nodes (24 total test runs). + +This is particularly useful when: +- You've made infrastructure changes to a specific runner type (driver updates, system configuration, Docker setup) +- You've added new runner nodes and want to validate they work with all existing model configurations +- You want to verify that all models remain compatible with a specific GPU type after system updates + +**Key difference from Scenario 2**: +- `runner-sweep`: Fix a **model**, sweep across runners → "Does this model work on all its runners?" +- `runner-model-sweep`: Fix a **runner type**, sweep across models → "Do all models work on this runner type?" 
+ +## Additional Use Cases with `full-sweep` + +The `full-sweep` command supports multiple filters that can be combined for targeted testing: + +**Test all gptoss configurations on B200 with 1k1k sequence lengths:** +``` +full-sweep --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Test all fp8 precision configs across all runners for 1k8k workloads:** +``` +full-sweep --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Test all TRT configs on H200 runners:** +``` +full-sweep --framework trt --runner-type h200 h200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Quick smoke test of all configs (highest TP, lowest concurrency only):** +``` +full-sweep --test-mode --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml +``` + +**Test specific model on specific hardware with specific sequence lengths:** +``` +full-sweep --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sglang --seq-lens 1k1k 8k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +## Custom One-off Tests + +**Scenario 4**: I want to run a quick test with a custom image, model, or configuration that isn't in the config files yet. + +Use the `custom` command to specify all parameters manually: +``` +custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.0 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` + +This runs a single 1k1k test job with your custom parameters on the specified runner node. 
Useful for: +- Testing new images before adding them to config files +- Quick validation of new models +- Experimenting with different frameworks or precisions diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index c43a1759e..bb0e22911 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -4,6 +4,33 @@ from pydantic import BaseModel, Field, ValidationError, ConfigDict from typing import List +# Field name constants +# Top-level config fields +FIELD_IMAGE = 'image' +FIELD_MODEL = 'model' +FIELD_MODEL_PREFIX = 'model-prefix' +FIELD_PRECISION = 'precision' +FIELD_FRAMEWORK = 'framework' +FIELD_RUNNER = 'runner' +FIELD_SEQ_LEN_CONFIGS = 'seq-len-configs' + +# Seq-len-config fields +FIELD_ISL = 'isl' +FIELD_OSL = 'osl' +FIELD_SEARCH_SPACE = 'search-space' + +# Search-space/benchmark fields +FIELD_TP = 'tp' +FIELD_CONC_START = 'conc-start' +FIELD_CONC_END = 'conc-end' +FIELD_EP = 'ep' +FIELD_DP_ATTN = 'dp-attn' + +# Matrix entry fields +FIELD_CONC = 'conc' +FIELD_MAX_MODEL_LEN = 'max-model-len' +FIELD_EXP_NAME = 'exp-name' + seq_len_stoi = { "1k1k": (1024, 1024), "1k8k": (1024, 8192), @@ -65,13 +92,13 @@ def validate_master_configs_structure(all_config_data): for key, val in all_config_data.items(): # Check for required top-level fields and their types required_fields = { - 'image': str, - 'model': str, - 'model-prefix': str, - 'precision': str, - 'framework': str, - 'runner': str, - 'seq-len-configs': list + FIELD_IMAGE: str, + FIELD_MODEL: str, + FIELD_MODEL_PREFIX: str, + FIELD_PRECISION: str, + FIELD_FRAMEWORK: str, + FIELD_RUNNER: str, + FIELD_SEQ_LEN_CONFIGS: list } for field, expected_type in required_fields.items(): @@ -82,42 +109,42 @@ def validate_master_configs_structure(all_config_data): raise ValueError( f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") - seq_len_configs = val['seq-len-configs'] 
+ seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] if len(seq_len_configs) == 0: raise ValueError( - f"'seq-len-configs' must be a non-empty list for key '{key}'") + f"'{FIELD_SEQ_LEN_CONFIGS}' must be a non-empty list for key '{key}'") # Validate each seq-len-config for i, seq_config in enumerate(seq_len_configs): # Check isl - if 'isl' not in seq_config or seq_config['isl'] is None: + if FIELD_ISL not in seq_config or seq_config[FIELD_ISL] is None: raise ValueError( - f"Missing 'isl' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config['isl'], int): + f"Missing '{FIELD_ISL}' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config[FIELD_ISL], int): raise ValueError( - f"'isl' must be int in seq-len-config[{i}] for key '{key}'") + f"'{FIELD_ISL}' must be int in seq-len-config[{i}] for key '{key}'") # Check osl - if 'osl' not in seq_config or seq_config['osl'] is None: + if FIELD_OSL not in seq_config or seq_config[FIELD_OSL] is None: raise ValueError( - f"Missing 'osl' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config['osl'], int): + f"Missing '{FIELD_OSL}' in seq-len-config[{i}] for key '{key}'") + if not isinstance(seq_config[FIELD_OSL], int): raise ValueError( - f"'osl' must be int in seq-len-config[{i}] for key '{key}'") + f"'{FIELD_OSL}' must be int in seq-len-config[{i}] for key '{key}'") - bmk_space = seq_config.get('search-space') + bmk_space = seq_config.get(FIELD_SEARCH_SPACE) if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: raise ValueError( - f"Missing or invalid 'search-space' in seq-len-config[{i}] for key '{key}'") + f"Missing or invalid '{FIELD_SEARCH_SPACE}' in seq-len-config[{i}] for key '{key}'") # Validate each benchmark in search-space for j, bmk in enumerate(bmk_space): # Define allowed fields - allowed_fields = {'tp', 'conc-start', - 'conc-end', 'ep', 'dp-attn'} - required_bmk_fields = {'tp': int, - 'conc-start': int, 'conc-end': int} - optional_bmk_fields = 
{'ep': int, 'dp-attn': bool} + allowed_fields = {FIELD_TP, FIELD_CONC_START, + FIELD_CONC_END, FIELD_EP, FIELD_DP_ATTN} + required_bmk_fields = {FIELD_TP: int, + FIELD_CONC_START: int, FIELD_CONC_END: int} + optional_bmk_fields = {FIELD_EP: int, FIELD_DP_ATTN: bool} # Check for extra fields extra_fields = set(bmk.keys()) - allowed_fields @@ -186,98 +213,98 @@ def generate_full_sweep(args, all_config_data): continue # Filter by precision if specified - if args.precision and val['precision'] not in args.precision: + if args.precision and val[FIELD_PRECISION] not in args.precision: continue # Filter by framework if specified - if args.framework and val['framework'] not in args.framework: + if args.framework and val[FIELD_FRAMEWORK] not in args.framework: continue # Filter by runner type if specified - if args.runner_type and val['runner'] not in args.runner_type: + if args.runner_type and val[FIELD_RUNNER] not in args.runner_type: continue - seq_len_configs = val['seq-len-configs'] - image = val['image'] - model = val['model'] - precision = val['precision'] - framework = val['framework'] - runner = val['runner'] - model_code = val['model-prefix'] + seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] + image = val[FIELD_IMAGE] + model = val[FIELD_MODEL] + precision = val[FIELD_PRECISION] + framework = val[FIELD_FRAMEWORK] + runner = val[FIELD_RUNNER] + model_code = val[FIELD_MODEL_PREFIX] for seq_config in seq_len_configs: - isl = seq_config['isl'] - osl = seq_config['osl'] + isl = seq_config[FIELD_ISL] + osl = seq_config[FIELD_OSL] # Filter by sequence lengths if specified if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config['search-space'] + bmk_space = seq_config[FIELD_SEARCH_SPACE] if args.test_mode: # In test mode, use highest TP with lowest concurrency - highest_tp_bmk = max(bmk_space, key=lambda x: x['tp']) - tp = highest_tp_bmk['tp'] - conc = highest_tp_bmk['conc-start'] - ep = highest_tp_bmk.get('ep') - dp_attn = 
highest_tp_bmk.get('dp-attn') + highest_tp_bmk = max(bmk_space, key=lambda x: x[FIELD_TP]) + tp = highest_tp_bmk[FIELD_TP] + conc = highest_tp_bmk[FIELD_CONC_START] + ep = highest_tp_bmk.get(FIELD_EP) + dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) seq_len_str = seq_len_to_str(isl, osl) entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'ep': 1, # Default - 'dp-attn': False, # Default - 'conc': conc, - 'max-model-len': isl + osl + 200, - 'exp-name': f"{model_code}_{seq_len_str}", + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + FIELD_TP: tp, + FIELD_EP: 1, # Default + FIELD_DP_ATTN: False, # Default + FIELD_CONC: conc, + FIELD_MAX_MODEL_LEN: isl + osl + 200, + FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", } if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) else: # Full sweep mode for bmk in bmk_space: - tp = bmk['tp'] - conc_start = bmk['conc-start'] - conc_end = bmk['conc-end'] - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') + tp = bmk[FIELD_TP] + conc_start = bmk[FIELD_CONC_START] + conc_end = bmk[FIELD_CONC_END] + ep = bmk.get(FIELD_EP) + dp_attn = bmk.get(FIELD_DP_ATTN) conc = conc_start while conc <= conc_end: seq_len_str = seq_len_to_str(isl, osl) entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl + 200, - 'ep': 1, # Default - 'dp-attn': False, # Default - 'exp-name': f"{model_code}_{seq_len_str}", + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + 
FIELD_TP: tp, + FIELD_CONC: conc, + FIELD_MAX_MODEL_LEN: isl + osl + 200, + FIELD_EP: 1, # Default + FIELD_DP_ATTN: False, # Default + FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", } if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) @@ -323,20 +350,20 @@ def generate_test_config(args, all_config_data): f"Specified key '{args.key}' does not exist in config files.") # Extract model code from config - model_code = val['model-prefix'] + model_code = val[FIELD_MODEL_PREFIX] - runner_nodes = runner_config.get(val['runner'], []) - if args.runner_node not in runner_nodes: + runner_nodes = runner_config.get(val[FIELD_RUNNER], []) + if args.runner_node and args.runner_node not in runner_nodes: raise ValueError( - f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val['runner']}'. Available runner nodes for this config are '{', '.join(runner_nodes)}'.") + f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val[FIELD_RUNNER]}'. 
Available runner nodes for this config are '{', '.join(runner_nodes)}'.") - seq_len_configs = val['seq-len-configs'] - image = val['image'] - model = val['model'] - precision = val['precision'] - framework = val['framework'] + seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] + image = val[FIELD_IMAGE] + model = val[FIELD_MODEL] + precision = val[FIELD_PRECISION] + framework = val[FIELD_FRAMEWORK] # Use default runner or specific runner node if input by user - runner = val['runner'] if not args.runner_node else args.runner_node + runner = val[FIELD_RUNNER] if not args.runner_node else args.runner_node # Convert seq-lens to set of (isl, osl) tuples for filtering seq_lens_filter = None @@ -347,71 +374,73 @@ def generate_test_config(args, all_config_data): # Process each sequence length configuration for seq_config in seq_len_configs: - isl = seq_config['isl'] - osl = seq_config['osl'] + isl = seq_config[FIELD_ISL] + osl = seq_config[FIELD_OSL] # Filter by sequence lengths if specified if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config['search-space'] + bmk_space = seq_config[FIELD_SEARCH_SPACE] for bmk in bmk_space: - tp = bmk['tp'] - conc_start = bmk['conc-start'] - conc_end = bmk['conc-end'] - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') + tp = bmk[FIELD_TP] + conc_start = bmk[FIELD_CONC_START] + conc_end = bmk[FIELD_CONC_END] + ep = bmk.get(FIELD_EP) + dp_attn = bmk.get(FIELD_DP_ATTN) # In test mode, only use the lowest concurrency (conc_start) if args.test_mode: entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': conc_start, - 'max-model-len': isl + osl, - 'exp-name': f"{model_code}_test", + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + 
FIELD_TP: tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: conc_start, + FIELD_MAX_MODEL_LEN: isl + osl, + FIELD_EXP_NAME: f"{model_code}_test", } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) else: # Generate entries for each concurrency value in the range conc = conc_start while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': conc, - 'max-model-len': isl + osl, + FIELD_IMAGE: image, + FIELD_MODEL: model, + FIELD_PRECISION: precision, + FIELD_FRAMEWORK: framework, + FIELD_RUNNER: runner, + FIELD_ISL: isl, + FIELD_OSL: osl, + FIELD_TP: tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: conc, + FIELD_MAX_MODEL_LEN: isl + osl, + FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) @@ -445,52 +474,52 @@ def generate_runner_model_sweep_config(args, all_config_data): matrix_values = [] for key, val in all_config_data.items(): # Only consider configs with specified runner - if val['runner'] != args.runner_type: + if val[FIELD_RUNNER] != args.runner_type: continue # Get model code for exp_name - model_code = val['model-prefix'] + model_code = val[FIELD_MODEL_PREFIX] # Find 1k1k config target_config = None - for config in val['seq-len-configs']: - if config['isl'] == 1024 and config['osl'] == 1024: + for config in val[FIELD_SEQ_LEN_CONFIGS]: + if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: target_config = config 
break - highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) # Since we are just testing, pick the highest TP for this config and just test # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk['tp'] - lowest_conc = highest_tp_bmk['conc-start'] + highest_tp = highest_tp_bmk[FIELD_TP] + lowest_conc = highest_tp_bmk[FIELD_CONC_START] - ep = highest_tp_bmk.get('ep') - dp_attn = highest_tp_bmk.get('dp-attn') + ep = highest_tp_bmk.get(FIELD_EP) + dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) for node in runner_nodes: entry = { - 'image': val['image'], - 'model': val['model'], - 'precision': val['precision'], - 'framework': val['framework'], + FIELD_IMAGE: val[FIELD_IMAGE], + FIELD_MODEL: val[FIELD_MODEL], + FIELD_PRECISION: val[FIELD_PRECISION], + FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], # Add one entry for each node under specified runner type - 'runner': node, + FIELD_RUNNER: node, # Again, just use 1k1k since this is just meant to smoke test all runners - 'isl': 1024, - 'osl': 1024, - 'tp': highest_tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': lowest_conc, - 'max-model-len': 2048, - 'exp-name': f"{model_code}_test", + FIELD_ISL: 1024, + FIELD_OSL: 1024, + FIELD_TP: highest_tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: lowest_conc, + FIELD_MAX_MODEL_LEN: 2048, + FIELD_EXP_NAME: f"{model_code}_test", } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) @@ -521,20 +550,20 @@ def generate_custom_test(args): return [ { - 'image': args.image, - 'model': args.model, - 'precision': args.precision, - 'framework': args.framework, - 'runner': args.runner_label, + FIELD_IMAGE: args.image, + FIELD_MODEL: args.model, + FIELD_PRECISION: 
args.precision, + FIELD_FRAMEWORK: args.framework, + FIELD_RUNNER: args.runner_label, # Again, just use 1k1k since this is just meant to smoke test all runners - 'isl': 1024, - 'osl': 1024, - 'tp': 8, - 'ep': 1, - 'dp-attn': False, - 'conc': 4, - 'exp-name': args.exp_name, - 'max-model-len': 2048, + FIELD_ISL: 1024, + FIELD_OSL: 1024, + FIELD_TP: 8, + FIELD_EP: 1, + FIELD_DP_ATTN: False, + FIELD_CONC: 4, + FIELD_EXP_NAME: args.exp_name, + FIELD_MAX_MODEL_LEN: 2048, } ] @@ -561,62 +590,62 @@ def generate_runner_sweep_config(args, all_config_data): # Only consider configs with specified runner if not key.startswith(args.model_prefix): continue - - if not val['runner'] == args.runner_type: + + if not val[FIELD_RUNNER] == args.runner_type: continue # Optionally filter by precision and framework - if (args.precision and val['precision'] != args.precision) or (args.framework and val['framework'] != args.framework): + if (args.precision and val[FIELD_PRECISION] != args.precision) or (args.framework and val[FIELD_FRAMEWORK] != args.framework): continue # Get model code for exp_name - model_code = val['model-prefix'] + model_code = val[FIELD_MODEL_PREFIX] - runner_nodes = runner_config.get(val['runner']) + runner_nodes = runner_config.get(val[FIELD_RUNNER]) if not runner_nodes: raise ValueError( - f"Runner '{val['runner']}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + f"Runner '{val[FIELD_RUNNER]}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") # Find 1k1k config target_config = None - for config in val['seq-len-configs']: - if config['isl'] == 1024 and config['osl'] == 1024: + for config in val[FIELD_SEQ_LEN_CONFIGS]: + if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: target_config = config break - highest_tp_bmk = max(target_config['search-space'], key=lambda x: x['tp']) + highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) # Since we are just testing, pick the highest TP for this config and just test # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk['tp'] - lowest_conc = highest_tp_bmk['conc-start'] + highest_tp = highest_tp_bmk[FIELD_TP] + lowest_conc = highest_tp_bmk[FIELD_CONC_START] - ep = highest_tp_bmk.get('ep') - dp_attn = highest_tp_bmk.get('dp-attn') + ep = highest_tp_bmk.get(FIELD_EP) + dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) for node in runner_nodes: entry = { - 'image': val['image'], - 'model': val['model'], - 'precision': val['precision'], - 'framework': val['framework'], + FIELD_IMAGE: val[FIELD_IMAGE], + FIELD_MODEL: val[FIELD_MODEL], + FIELD_PRECISION: val[FIELD_PRECISION], + FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], # Add one entry for each node under specified runner type - 'runner': node, + FIELD_RUNNER: node, # Again, just use 1k1k since this is just meant to smoke test all runners - 'isl': 1024, - 'osl': 1024, - 'tp': highest_tp, - 'ep': 1, # Default, - 'dp-attn': False, # Default - 'conc': lowest_conc, - 'exp-name': f"{model_code}_test", - 'max-model-len': 2048, + FIELD_ISL: 1024, + FIELD_OSL: 1024, + FIELD_TP: highest_tp, + FIELD_EP: 1, # Default, + FIELD_DP_ATTN: False, # Default + FIELD_CONC: lowest_conc, + FIELD_EXP_NAME: f"{model_code}_test", + FIELD_MAX_MODEL_LEN: 2048, } # Add optional fields if they exist if ep is not None: - entry['ep'] = ep + entry[FIELD_EP] = ep if dp_attn is not None: - entry['dp-attn'] = dp_attn + 
entry[FIELD_DP_ATTN] = dp_attn matrix_values.append(entry) From d233ea2fa50f641aad99a6246bc63b79089f560b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:54:52 -0500 Subject: [PATCH 123/149] remove concurrency locks --- .github/workflows/1k1k-sweep.yml | 4 ---- .github/workflows/1k8k-sweep.yml | 4 ---- .github/workflows/8k1k-sweep.yml | 4 ---- .github/workflows/full-sweep-test.yml | 4 ---- 4 files changed, 16 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index f6ec37562..0930f8a9a 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,9 +1,5 @@ name: "Full Sweep Scheduler - 1k1k" -concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: false - on: workflow_dispatch: schedule: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index 82bc48817..c3bcf9662 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,9 +1,5 @@ name: "Full Sweep Scheduler - 1k8k" -concurrency: - group: benchmark-lock-1k8k - cancel-in-progress: false - on: workflow_dispatch: schedule: diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index 8863112af..fdb6b6112 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -1,9 +1,5 @@ name: "Full Sweep Scheduler - 8k1k" -concurrency: - group: benchmark-lock-8k1k - cancel-in-progress: false - on: workflow_dispatch: schedule: diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml index a2ff06d18..3657971ac 100644 --- a/.github/workflows/full-sweep-test.yml +++ b/.github/workflows/full-sweep-test.yml @@ -1,9 +1,5 @@ name: Test - Full Sweep -concurrency: - group: benchmark-lock - cancel-in-progress: false - on: workflow_dispatch: inputs: From 4e1228b465129a20168c0fbb772bf3a14d13cfea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:56:31 -0500 Subject: [PATCH 
124/149] add dpa to results filename --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 571b39888..d1acf16c7 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -127,7 +127,7 @@ jobs: - name: Launch job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_conc${{ env.CONC }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_tp${{ env.TP }}_ep${{ env.EP_SIZE }}_dpa_${{ env.DP_ATTENTION }}_conc${{ env.CONC }}_${{ runner.name }} run: | bash ./runners/launch_${RUNNER_NAME%%_*}.sh if [ -f "$RESULT_FILENAME.json" ]; then From d816ef46244c8a5ed135692ae46aea83d254c650 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 14:59:49 -0500 Subject: [PATCH 125/149] add back plotting --- .github/workflows/collect-results.yml | 12 ++ utils/plot_perf.py | 197 ++++++++++++++++++++++++++ 2 files changed, 209 insertions(+) create mode 100644 utils/plot_perf.py diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 1afe9f049..c1799117e 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -35,3 +35,15 @@ jobs: with: name: results_${{ inputs.exp-name }} path: agg_${{ inputs.exp-name }}.json + + - name: Plot performance + run: | + pip install -q matplotlib + python3 utils/plot_perf.py results/ ${{ inputs.exp-name }} + - name: Upload performance graphs + uses: actions/upload-artifact@v4 + with: + name: graphs_${{ inputs.exp-name }} + path: | + tput_vs_intvty_*_${{ inputs.exp-name }}.png + tput_vs_e2el_*_${{ inputs.exp-name }}.png diff --git a/utils/plot_perf.py b/utils/plot_perf.py new file mode 100644 index 000000000..1cab81cdc --- /dev/null +++ 
b/utils/plot_perf.py @@ -0,0 +1,197 @@ +import sys +import json +from pathlib import Path +import matplotlib.pyplot as plt + + +results_dir = Path(sys.argv[1]) +exp_name = sys.argv[2] +hw_color = { + 'h100': 'lightgreen', + 'h200': 'green', # H200 VLLM + 'h200-trt': 'darkgreen', # H200 TRT-LLM + 'b200': 'black', # B200 VLLM + 'b200-trt': 'gray', # B200 TRT-LLM + 'mi300x': 'pink', + 'mi325x': 'red', + 'mi355x': 'purple', + 'gb200': 'orange', # GB200 TRT-LLM and SGlang +} + +results = [] +for result_path in results_dir.rglob(f'*.json'): + with open(result_path) as f: + result = json.load(f) + results.append(result) + + +def plot_tput_vs_e2el(precision_filter=None): + fig, ax = plt.subplots() + + # Filter results by precision if specified + filtered_results = results + if precision_filter is not None: + filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] + + for hw_label, color in hw_color.items(): + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in filtered_results: + x, y = result['median_e2el'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('End-to-end Latency (s)') + ax.set_ylabel('Throughput per GPU 
(tok/s)') + ax.legend(title='GPU Type') + fig.tight_layout() + + precision_suffix = f"_{precision_filter}" if precision_filter else "" + fig.savefig(f'tput_vs_e2el_{exp_name}{precision_suffix}.png', bbox_inches='tight') + plt.close(fig) + + +def plot_tput_vs_intvty(precision_filter=None): + fig, ax = plt.subplots() + + # Filter results by precision if specified + filtered_results = results + if precision_filter is not None: + filtered_results = [r for r in results if r.get('precision', 'fp8') == precision_filter] + + for hw_label, color in hw_color.items(): + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in filtered_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in filtered_results: + x, y = result['median_intvty'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('Interactivity (tok/s/user)') + ax.set_ylabel('Throughput per GPU (tok/s)') + ax.legend(title='GPU Type') + fig.tight_layout() + + precision_suffix = f"_{precision_filter}" if precision_filter else "" + fig.savefig(f'tput_vs_intvty_{exp_name}{precision_suffix}.png', bbox_inches='tight') + plt.close(fig) + + +def plot_tput_vs_e2el_for_model(model_results, model_name): + fig, ax = plt.subplots() + + for hw_label, color in hw_color.items(): 
+ # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_e2el'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_e2el'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in model_results: + x, y = result['median_e2el'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('End-to-end Latency (s)') + ax.set_ylabel('Throughput per GPU (tok/s)') + ax.legend(title='Hardware + Framework') + ax.set_title(f'{model_name} - All Frameworks') + fig.tight_layout() + + # Extract model identifier from model name + model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + fig.savefig(f'tput_vs_e2el_{model_id}_{exp_name}.png', bbox_inches='tight') + plt.close(fig) + + +def plot_tput_vs_intvty_for_model(model_results, model_name): + fig, ax = plt.subplots() + + for hw_label, color in hw_color.items(): + # Separate fp8 and fp4 results for this hardware + fp8_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp8'] + fp4_results = [r for r in model_results if r['hw'] == hw_label and r.get('precision', 'fp8') == 'fp4'] + + # Plot fp8 results with circles + if fp8_results: + xs_fp8 = [r['median_intvty'] for r in fp8_results] + ys_fp8 = [r['tput_per_gpu'] for r in fp8_results] + ax.scatter(xs_fp8, ys_fp8, 
label=f"{hw_label.upper()} (fp8)", color=color, marker='o', s=60) + + # Plot fp4 results with squares + if fp4_results: + xs_fp4 = [r['median_intvty'] for r in fp4_results] + ys_fp4 = [r['tput_per_gpu'] for r in fp4_results] + ax.scatter(xs_fp4, ys_fp4, label=f"{hw_label.upper()} (fp4)", color=color, marker='s', s=60) + + for result in model_results: + x, y = result['median_intvty'], result['tput_per_gpu'] + ax.annotate(str(result['tp']), (x, y), textcoords='offset points', xytext=(3, 3), ha='left', fontsize=8) + + ax.set_xlabel('Interactivity (tok/s/user)') + ax.set_ylabel('Throughput per GPU (tok/s)') + ax.legend(title='Hardware + Framework') + ax.set_title(f'{model_name} - All Frameworks') + fig.tight_layout() + + # Extract model identifier from model name + model_id = model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + fig.savefig(f'tput_vs_intvty_{model_id}_{exp_name}.png', bbox_inches='tight') + plt.close(fig) + + +# Create one plot per model showing all frameworks and hardware +# Group results by model family (70b, dsr1, etc.) 
instead of full model name +def get_model_family(model_name): + if '70b' in model_name.lower() or 'llama-3.3-70b' in model_name.lower(): + return '70b' + elif 'dsr1' in model_name.lower() or 'deepseek-r1' in model_name.lower(): + return 'dsr1' + else: + # Fallback to first part of model name + return model_name.split('/')[-1].split('-')[0] if '/' in model_name else model_name + +model_families = set(get_model_family(r.get('model', 'unknown')) for r in results) + +for model_family in model_families: + # Filter results for this model family + model_results = [r for r in results if get_model_family(r.get('model', 'unknown')) == model_family] + + # Create plots for this model family + plot_tput_vs_e2el_for_model(model_results, model_family) + plot_tput_vs_intvty_for_model(model_results, model_family) From 249a94c24c7be7ab43d49668ea9f1d264e55dd79 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 126/149] testing concurrency From 6589e53621fae686b97a13e24345ccbf5d0db06d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 127/149] adding more workflows --- .github/workflows/test.yml | 147 ++++++++++++++++++ utils/matrix-logic/get_test_sweep_configs.py | 151 +++++++++++++++++++ 2 files changed, 298 insertions(+) create mode 100644 .github/workflows/test.yml create mode 100644 utils/matrix-logic/get_test_sweep_configs.py diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: 
"Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - "mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + 
max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py new file mode 100644 index 000000000..87ab0457b --- /dev/null +++ b/utils/matrix-logic/get_test_sweep_configs.py @@ -0,0 +1,151 @@ +import json +import yaml +import sys +import argparse + +seq_len_stoi = { + "1k1k": (1024, 1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +def main(): + parser = argparse.ArgumentParser( + description='Generate benchmark matrix from a specific configuration key' + ) + parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parser.add_argument( + '--key', + required=True, + help='Configuration key to use' + ) + parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + help=f"Sequence 
length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." + ) + parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + + args = parser.parse_args() + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Load and merge all config files + all_config_data = {} + for config_file in args.config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys + duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + # Check if the key exists + if args.key not in all_config_data: + available_keys = ', '.join(sorted(all_config_data.keys())) + raise ValueError( + f"Key '{args.key}' not found in configuration files. 
" + f"Available keys: {available_keys}" + ) + + val = all_config_data[args.key] + + # Validate required fields + seq_len_configs = val.get('seq-len-configs') + assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" + + image = val.get('image') + model = val.get('model') + precision = val.get('precision') + framework = val.get('framework') + runner = val.get('runner') + + assert None not in (image, model, precision, framework, runner), \ + f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" + + matrix_values = [] + + # Process each sequence length configuration + for seq_config in seq_len_configs: + isl = seq_config.get('isl') + osl = seq_config.get('osl') + + assert None not in (isl, osl), \ + f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config.get('bmk-space') + assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" + + for bmk in bmk_space: + tp = bmk.get('tp') + conc_start = bmk.get('conc-start') + conc_end = bmk.get('conc-end') + ep = bmk.get('ep') + dp_attn = bmk.get('dp-attn') + + assert None not in (tp, conc_start, conc_end), \ + f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" + + # Generate entries for each concurrency value in the range + conc = conc_start + while conc <= conc_end: + entry = { + 'image': image, + 'model': model, + 'precision': precision, + 'framework': framework, + 'runner': runner, + 'isl': isl, + 'osl': osl, + 'tp': tp, + 'conc': conc, + 'max-model-len': isl + osl, + } + + # Add optional fields if they exist + if ep is not None: + entry['ep'] = ep + if dp_attn is not None: + entry['dp-attn'] = dp_attn + + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + print(json.dumps(matrix_values)) + 
return matrix_values + +if __name__ == "__main__": + main() \ No newline at end of file From 3695ed50007215c6342906042d3ab76d8eca7ef2 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 128/149] deleting files --- .github/workflows/test.yml | 147 ------------------------------------- 1 file changed, 147 deletions(-) delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 0d92952da..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,147 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - pull_request: - workflow_dispatch: - inputs: - name: - description: "Name of benchmark from master configs" - required: true - type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true - - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - -jobs: - get-jobs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ 
steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-jobs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - test-sweep: - needs: get-jobs - uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ inputs.name }} - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} - - calc-success-rate: - needs: test-sweep - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 
utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From b328c7f59db27fd41f10331a6e8032401d5d9fb7 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:37:58 -0500 Subject: [PATCH 129/149] temp fix (#148) --- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ++++++++++++++++++++++++ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ++++++++++++++++++++++++ benchmarks/70b_fp8_h200_slurm.sh | 69 ++++++++++++++++++++++ benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ++++++++++++++++++++++ benchmarks/70b_fp8_mi325x_slurm.sh | 86 ++++++++++++++++++++++++++++ 5 files changed, 375 insertions(+) create mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_slurm.sh create mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh create mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp4_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: 
false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh new file mode 100644 index 000000000..ad24453b3 --- /dev/null +++ b/benchmarks/70b_fp8_b200_trt_slurm.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + + +set -x + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then 
+cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +# Launch TRT-LLM server +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh new file mode 100644 index 000000000..094fbd19c --- /dev/null +++ b/benchmarks/70b_fp8_h200_slurm.sh @@ -0,0 +1,69 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +set -x +hf download $MODEL +pip install datasets pandas + +# Calculate max-model-len based on ISL and OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = 
"1024" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) +elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then + CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) +else + CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} +fi + +# Create config.yaml +cat > config.yaml << EOF +kv-cache-dtype: fp8 +async-scheduling: true +no-enable-prefix-caching: true +max-num-batched-tokens: 8192 +max-model-len: $CALCULATED_MAX_MODEL_LEN +EOF + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +export TORCH_CUDA_ARCH_LIST="9.0" + +PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ + --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ + --disable-log-requests > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh new file mode 100644 index 000000000..dfb2324b9 --- /dev/null +++ b/benchmarks/70b_fp8_h200_trt_slurm.sh @@ -0,0 +1,70 @@ +#!/usr/bin/env bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +hf download $MODEL +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) 
+PORT=$(( 8888 + $PORT_OFFSET )) + +# Create llama-config.yml inline +# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros +if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then +cat > llama-config.yml << 'EOF' +batch_wait_max_tokens_ratio: 0.9 +batch_wait_timeout_iters: 20 +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +else +cat > llama-config.yml << 'EOF' +cuda_graph_config: + enable_padding: true + max_batch_size: 1024 +kv_cache_config: + dtype: fp8 + enable_block_reuse: false +stream_interval: 10 +EOF +fi + +mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend openai \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh new file mode 100644 index 000000000..1febeff13 --- /dev/null +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -0,0 +1,86 @@ +#!/usr/bin/bash + +# === Required Env Vars === +# HF_TOKEN +# HF_HUB_CACHE +# IMAGE +# MODEL +# ISL +# OSL +# 
MAX_MODEL_LEN +# RANDOM_RANGE_RATIO +# TP +# CONC +# RESULT_FILENAME +# PORT_OFFSET + +echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" + +huggingface-cli download $MODEL + +SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) +PORT=$(( 8888 + $PORT_OFFSET )) + +# Reference +# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark + +cat > config.yaml << EOF +compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' +EOF + +if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + export VLLM_ROCM_USE_AITER_MHA=0 +elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ "$CONC" -ge "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi +fi + +# Patch the aiter config script to deal +# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. +file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' +sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch + + +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
+set -x +vllm serve $MODEL --port=$PORT \ +--swap-space=64 \ +--gpu-memory-utilization=0.94 \ +--dtype=float16 --kv-cache-dtype=fp8 \ +--distributed-executor-backend=mp --tensor-parallel-size=$TP \ +--max-model-len=$MAX_MODEL_LEN \ +--max-seq-len-to-capture=$MAX_MODEL_LEN \ +--max-num-seqs=$CONC \ +--max-num-batched-tokens=131072 \ +--no-enable-prefix-caching \ +--config config.yaml \ +--async-scheduling \ +--disable-log-requests \ +> $SERVER_LOG 2>&1 & + +set +x +while IFS= read -r line; do + printf '%s\n' "$line" + if [[ "$line" == *"Application startup complete"* ]]; then + break + fi +done < <(tail -F -n0 "$SERVER_LOG") + +set -x +git clone https://github.com/kimbochen/bench_serving.git +python3 bench_serving/benchmark_serving.py \ +--model $MODEL --backend vllm \ +--base-url http://0.0.0.0:$PORT \ +--dataset-name random \ +--random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ +--num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ +--request-rate inf --ignore-eos \ +--save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ +--result-dir /workspace/ \ +--result-filename $RESULT_FILENAME.json From 264186fb128e2c10e8ec8dadce41696560854060 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 130/149] testing concurrency From e9e0e70d83383af48fede768ac4f7aa34ce5fd24 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 16:21:44 -0500 Subject: [PATCH 131/149] update random range ratio default --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index d1acf16c7..2eef0e18f 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -44,7 +44,7 @@ on: random-range-ratio: required: false type: string - default: '0.2' + default: '0.8' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} From 
bbc22209e03dca7a9acfe2f545503de003b723b8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 16:31:06 -0500 Subject: [PATCH 132/149] get process results vals from env vars instead of argv --- .../workflows/benchmark-multinode-tmpl.yml | 2 +- .github/workflows/benchmark-tmpl.yml | 2 +- utils/process_result.py | 22 ++++++++++--------- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index b4d917575..bfbd5a1cf 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -94,7 +94,7 @@ jobs: # Extract GPU count from filename for tp_size calculation gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/") if [ -n "$gpus" ]; then - python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 false ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE + TP=$gpus RESULT_FILENAME=${result_file%.json} EP_SIZE=1 DP_ATTENTION=false python3 utils/process_result.py fi fi done diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 2eef0e18f..754cbb969 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -139,7 +139,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION + python3 utils/process_result.py ${{ inputs.runner }} - name: Upload result uses: actions/upload-artifact@v4 with: diff --git a/utils/process_result.py b/utils/process_result.py index a59d1f7f3..d59a61790 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -1,15 +1,17 @@ import sys import json +import os from pathlib import Path -hw = sys.argv[1] -tp_size = int(sys.argv[2]) -ep_size = int(sys.argv[3]) -dp_attention = sys.argv[4] -result_filename = sys.argv[5] -framework = sys.argv[6] -precision = sys.argv[7] +hw = 
os.environ.get('RUNNER_NAME') +tp_size = int(os.environ.get('TP')) +ep_size = int(os.environ.get('EP_SIZE')) +dp_attention = os.environ.get('DP_ATTENTION') +result_filename = os.environ.get('RESULT_FILENAME') +framework = os.environ.get('FRAMEWORK') +precision = os.environ.get('PRECISION') +mtp_mode = os.environ.get('MTP_MODE') with open(f'{result_filename}.json') as f: bmk_result = json.load(f) @@ -18,8 +20,8 @@ 'hw': hw, 'tp': tp_size, 'ep': ep_size, - 'conc': int(bmk_result['max_concurrency']), 'dp_attention': dp_attention, # true or false + 'conc': int(bmk_result['max_concurrency']), 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, @@ -27,8 +29,8 @@ 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size } -if len(sys.argv) == 9: # MTP - data['mtp'] = sys.argv[8] +if mtp_mode: # MTP + data['mtp'] = mtp_mode for key, value in bmk_result.items(): if key.endswith('ms'): From d5ec7dec14da7103feea08ca14601bf5975b79b4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 16:42:39 -0500 Subject: [PATCH 133/149] get process results vals from env vars instead of argv pt 2 --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 754cbb969..293e3ac49 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -139,7 +139,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} + python3 utils/process_result.py - name: Upload result uses: actions/upload-artifact@v4 with: From 6af36effae2cad1e021c3600efb73822a868d744 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 17:57:15 -0500 Subject: [PATCH 134/149] editing runners yaml --- .github/README.md | 8 ++++++++ .github/configs/runners.yaml | 1 - 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/README.md b/.github/README.md 
index f4539dd5d..69fc1069f 100644 --- a/.github/README.md +++ b/.github/README.md @@ -33,6 +33,8 @@ Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter test-config --key dsr1-fp4-b200-sglang --seq-len 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986046399 + If we wanted to also test 1k8k or 8k1k scenarios, we would simply append `1k8k` or `8k1k` to `--seq-len`, respectively. Further, if we wanted to run that config on *one specific* runner node, we could specify that by appending `--runner-node` to the argument list. Note that if the specified runner node is not compatible with the specified config key (as dictated by `.github/configs/runners.yaml`), then the workflow will error: @@ -43,6 +45,8 @@ test-config --config-files .github/configs/nvidia-master.yaml --runner-config .g ValueError: Runner node 'mi300x-amd_0' is not compatible with config 'dsr1-fp4-b200-sglang' which runs on runner type 'b200'. Available runner nodes for this config are 'b200-nb_0, b200-nb_1, b200-nvd_0, b200-nvd_1, b200-nvd_2, b200-nvd_3, b200-tg_0'. ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986053019/job/54229839736 + **Scenario 2**: I just made a change to the `benchmarks/dsr1_fp8_b200_docker.sh` and I need to verify that these changes work across all B200 runners. 
Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the text following command as the text input: @@ -50,6 +54,8 @@ Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter runner-sweep --runner-type b200 --model-prefix dsr1 --precision fp8 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986283169 + This will run a test (just the highest available parallelism and lowest available concurrency) for each B200 runner node for each Deepseek config that runs on B200 with fp8 precision. I.e., this can be used to "sweep" across runners for a particular model to test that all runners still work with changes that have been made. **Scenario 3**: I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. @@ -59,6 +65,8 @@ Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter runner-model-sweep --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` +Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986292917 + This will run a test (just the highest available parallelism and lowest available concurrency) for each configuration that specifies the `h200` runner type, across all H200 runner nodes defined in `.github/configs/runners.yaml`. For example, if you have configs `dsr1-fp8-h200-sglang`, `dsr1-fp8-h200-trt`, and `gptoss-fp4-h200-vllm` that all use `runner: h200`, and you have 8 H200 nodes (`h200-cw_0`, `h200-cw_1`, etc.), this will run all 3 configs on all 8 nodes (24 total test runs). 
diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 692ade8dd..cdd865561 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -38,7 +38,6 @@ b200: - 'b200-nvd_1' - 'b200-nvd_2' - 'b200-nvd_3' -- 'b200-tg_0' mi300x: - 'mi300x-amd_0' - 'mi300x-amd_1' From cefcf15268b6ade1fc275e89854a3ce4b95e0602 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 135/149] testing concurrency From 46545a910a5befb8869ca57759b7b0f7e467bf84 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Mon, 27 Oct 2025 14:42:41 -0500 Subject: [PATCH 136/149] adding more workflows --- .github/workflows/1k1k-sweep.yml | 6 +- .github/workflows/1k8k-sweep.yml | 6 +- .github/workflows/test.yml | 147 +++++++++++++++++++++++++++++++ 3 files changed, 155 insertions(+), 4 deletions(-) create mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index 0930f8a9a..e806f4c70 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,4 +1,8 @@ -name: "Full Sweep Scheduler - 1k1k" +name: "1K/1K Sweep" + +concurrency: + group: benchmark-lock-1k1k + cancel-in-progress: false on: workflow_dispatch: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index c3bcf9662..f4bb4338e 100644 --- a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,9 +1,9 @@ name: "Full Sweep Scheduler - 1k8k" on: - workflow_dispatch: - schedule: - - cron: "0 23 * * *" + workflow_dispatch: + schedule: + - cron: '0 23 * * *' jobs: get-dsr1-configs: diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml new file mode 100644 index 000000000..0d92952da --- /dev/null +++ b/.github/workflows/test.yml @@ -0,0 +1,147 @@ +name: Test - Full Sweep + +concurrency: + group: benchmark-lock + cancel-in-progress: false + +on: + pull_request: + workflow_dispatch: + inputs: + name: + 
description: "Name of benchmark from master configs" + required: true + type: string + default: 70b-fp4-mi355x-vllm + + run_1k1k: + description: "Run ISL/OSL 1k/1k" + type: boolean + required: true + run_1k8k: + description: "Run ISL/OSL 1k/8k" + type: boolean + required: true + run_8k1k: + description: "Run ISL/OSL 8k/1k" + type: boolean + required: true + + runner: + description: "Specific runner node to run on" + required: false + type: choice + options: + - "h100-cr_0" + - "h100-cr_1" + - "h100-cw_0" + - "h100-cw_1" + - "h200-cw_0" + - "h200-cw_1" + - "h200-nb_0" + - "h200-nb_1" + - "h200-nb_2" + - "h200-nb_3" + - "h200-nv_0" + - "h200-nv_1" + - "h200-nv_2" + - "h200-nv_3" + - "b200-nv_0" + - "b200-nv_1" + - "b200-nb_0" + - "b200-nb_1" + - "b200-nvd_0" + - "b200-nvd_1" + - "b200-nvd_2" + - "b200-nvd_3" + - "b200-tg_0" + - "mi300x-amd_0" + - "mi300x-amd_1" + - "mi300x-amd_2" + - "mi300x-amd_3" + - "mi300x-amd_4" + - "mi300x-cr_0" + - "mi300x-oci_0" + - "mi325x-amd_0" + - "mi325x-tw_0" + - "mi325x-tw_1" + - "mi325x-tw_2" + - "mi325x-tw_3" + - "mi355x-amd_0" + - "mi355x-amd_1" + - "mi355x-amd_2" + - "mi355x-amd_3" + +jobs: + get-jobs: + runs-on: ubuntu-latest + outputs: + search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - id: get-jobs + run: | + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ + --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ + --key ${{ inputs.name }} \ + ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) + echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + test-sweep: + needs: get-jobs + uses: ./.github/workflows/benchmark-tmpl.yml + name: test sweep - ${{ inputs.name }} + strategy: + 
fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep || 1 }} + dp-attn: ${{ matrix.config.dp-attn || false }} + conc: ${{ matrix.config.conc }} + + calc-success-rate: + needs: test-sweep + if: ${{ always() }} + runs-on: ubuntu-latest + + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@v4 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@v4 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json From e59f2d712648f91434fcc99cc14c47ba3c7711f6 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Wed, 29 Oct 2025 17:05:27 -0500 Subject: [PATCH 137/149] deleting files --- .github/workflows/test.yml | 147 ------------------------------------- 1 file changed, 147 deletions(-) delete mode 100644 .github/workflows/test.yml diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml deleted file mode 100644 index 0d92952da..000000000 --- a/.github/workflows/test.yml +++ /dev/null @@ -1,147 +0,0 @@ -name: Test - Full Sweep - -concurrency: - group: benchmark-lock - cancel-in-progress: false - -on: - pull_request: - workflow_dispatch: - 
inputs: - name: - description: "Name of benchmark from master configs" - required: true - type: string - default: 70b-fp4-mi355x-vllm - - run_1k1k: - description: "Run ISL/OSL 1k/1k" - type: boolean - required: true - run_1k8k: - description: "Run ISL/OSL 1k/8k" - type: boolean - required: true - run_8k1k: - description: "Run ISL/OSL 8k/1k" - type: boolean - required: true - - runner: - description: "Specific runner node to run on" - required: false - type: choice - options: - - "h100-cr_0" - - "h100-cr_1" - - "h100-cw_0" - - "h100-cw_1" - - "h200-cw_0" - - "h200-cw_1" - - "h200-nb_0" - - "h200-nb_1" - - "h200-nb_2" - - "h200-nb_3" - - "h200-nv_0" - - "h200-nv_1" - - "h200-nv_2" - - "h200-nv_3" - - "b200-nv_0" - - "b200-nv_1" - - "b200-nb_0" - - "b200-nb_1" - - "b200-nvd_0" - - "b200-nvd_1" - - "b200-nvd_2" - - "b200-nvd_3" - - "b200-tg_0" - - "mi300x-amd_0" - - "mi300x-amd_1" - - "mi300x-amd_2" - - "mi300x-amd_3" - - "mi300x-amd_4" - - "mi300x-cr_0" - - "mi300x-oci_0" - - "mi325x-amd_0" - - "mi325x-tw_0" - - "mi325x-tw_1" - - "mi325x-tw_2" - - "mi325x-tw_3" - - "mi355x-amd_0" - - "mi355x-amd_1" - - "mi355x-amd_2" - - "mi355x-amd_3" - -jobs: - get-jobs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-jobs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@v4 - - - id: get-jobs - run: | - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_test_sweep_configs.py \ - --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml \ - --key ${{ inputs.name }} \ - ${{ (inputs.run_1k1k || inputs.run_1k8k || inputs.run_8k1k) && format('--seq-lens{0}{1}{2}', inputs.run_1k1k && ' 1k1k' || '', inputs.run_1k8k && ' 1k8k' || '', inputs.run_8k1k && ' 8k1k' || '') || '' }}) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - test-sweep: - needs: get-jobs - uses: ./.github/workflows/benchmark-tmpl.yml - name: test sweep - ${{ 
inputs.name }} - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "dsr1_1k1k" - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ inputs.runner != '' && inputs.runner || matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep || 1 }} - dp-attn: ${{ matrix.config.dp-attn || false }} - conc: ${{ matrix.config.conc }} - - calc-success-rate: - needs: test-sweep - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@v3 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@v4 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@v4 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json From fe445a1a9dfcc5d1cb7cca8505ba3e50f8b4f766 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 138/149] testing concurrency From d1540496a2eaed5c218a1582b579230f80277861 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 139/149] testing concurrency From 880d3c8276eb8ce839776d2876f5fa43b85c7aae Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Sun, 26 Oct 2025 18:49:34 -0500 Subject: [PATCH 140/149] testing concurrency From 026d16b82a25399f3c68cfa49b10c66ba3f9566c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: 
Fri, 31 Oct 2025 18:46:45 -0500 Subject: [PATCH 141/149] remove 70b --- benchmarks/70b_fp4_b200_trt_slurm.sh | 75 ------------------------ benchmarks/70b_fp8_b200_trt_slurm.sh | 75 ------------------------ benchmarks/70b_fp8_h200_slurm.sh | 69 ---------------------- benchmarks/70b_fp8_h200_trt_slurm.sh | 70 ---------------------- benchmarks/70b_fp8_mi325x_slurm.sh | 86 ---------------------------- 5 files changed, 375 deletions(-) delete mode 100644 benchmarks/70b_fp4_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_b200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_slurm.sh delete mode 100644 benchmarks/70b_fp8_h200_trt_slurm.sh delete mode 100644 benchmarks/70b_fp8_mi325x_slurm.sh diff --git a/benchmarks/70b_fp4_b200_trt_slurm.sh b/benchmarks/70b_fp4_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp4_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server 
-mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_b200_trt_slurm.sh b/benchmarks/70b_fp8_b200_trt_slurm.sh deleted file mode 100644 index ad24453b3..000000000 --- a/benchmarks/70b_fp8_b200_trt_slurm.sh +++ /dev/null @@ -1,75 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - - -set -x - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" == "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF 
-else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -# Launch TRT-LLM server -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_slurm.sh b/benchmarks/70b_fp8_h200_slurm.sh deleted file mode 100644 index 094fbd19c..000000000 --- a/benchmarks/70b_fp8_h200_slurm.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -set -x -hf download $MODEL -pip install datasets pandas - -# Calculate max-model-len based on ISL and OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 20)) -elif [ "$ISL" = "8192" ] || [ "$OSL" = "8192" ]; then - CALCULATED_MAX_MODEL_LEN=$((ISL + OSL + 200)) -else - CALCULATED_MAX_MODEL_LEN=${MAX_MODEL_LEN:-10240} -fi - -# Create 
config.yaml -cat > config.yaml << EOF -kv-cache-dtype: fp8 -async-scheduling: true -no-enable-prefix-caching: true -max-num-batched-tokens: 8192 -max-model-len: $CALCULATED_MAX_MODEL_LEN -EOF - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -export TORCH_CUDA_ARCH_LIST="9.0" - -PYTHONNOUSERSITE=1 vllm serve $MODEL --host 0.0.0.0 --port $PORT --config config.yaml \ - --gpu-memory-utilization 0.9 --tensor-parallel-size $TP --max-num-seqs $CONC \ - --disable-log-requests > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json diff --git a/benchmarks/70b_fp8_h200_trt_slurm.sh b/benchmarks/70b_fp8_h200_trt_slurm.sh deleted file mode 100644 index dfb2324b9..000000000 --- a/benchmarks/70b_fp8_h200_trt_slurm.sh +++ /dev/null @@ -1,70 +0,0 @@ -#!/usr/bin/env bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -hf download $MODEL -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + $PORT_OFFSET )) - -# Create llama-config.yml inline -# For 1k/1k, use batch_wait_max_tokens_ratio and batch_wait_timeout_iters will improve the performance, by default they are all zeros -if [[ "$ISL" == "1024" && "$OSL" 
== "1024" && ${TP} -lt 8 ]]; then -cat > llama-config.yml << 'EOF' -batch_wait_max_tokens_ratio: 0.9 -batch_wait_timeout_iters: 20 -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -else -cat > llama-config.yml << 'EOF' -cuda_graph_config: - enable_padding: true - max_batch_size: 1024 -kv_cache_config: - dtype: fp8 - enable_block_reuse: false -stream_interval: 10 -EOF -fi - -mpirun -n 1 --oversubscribe --allow-run-as-root trtllm-serve $MODEL --tp_size $TP --trust_remote_code --max_seq_len $MAX_MODEL_LEN --max_num_tokens 16384 --extra_llm_api_options llama-config.yml --port $PORT > $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend openai \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json \ No newline at end of file diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh deleted file mode 100644 index 1febeff13..000000000 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ /dev/null @@ -1,86 +0,0 @@ -#!/usr/bin/bash - -# === Required Env Vars === -# HF_TOKEN -# HF_HUB_CACHE -# IMAGE -# MODEL -# ISL -# OSL -# MAX_MODEL_LEN -# RANDOM_RANGE_RATIO -# TP -# CONC -# RESULT_FILENAME -# PORT_OFFSET - -echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" - -huggingface-cli download $MODEL - -SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) -PORT=$(( 8888 + 
$PORT_OFFSET )) - -# Reference -# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark - -cat > config.yaml << EOF -compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}' -EOF - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - export VLLM_ROCM_USE_AITER_MHA=0 -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -ge "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi -fi - -# Patch the aiter config script to deal -# with weird strings reported by /opt/rocm/llvm/bin/amdgpu-arch. -file_to_patch='/opt/venv/lib/python3.10/site-packages/aiter_meta/csrc/cpp_itfs/utils.py' -sed -i'' -e 's#archs = \[arch.strip() for arch in archs\]#archs = \[arch.strip().split(":")\[0\] for arch in archs\]#' $file_to_patch - - -# In this specific case, float16 performs better than the datatype -# picked by vllm when using auto for --dtype (bfloat16). 
-set -x -vllm serve $MODEL --port=$PORT \ ---swap-space=64 \ ---gpu-memory-utilization=0.94 \ ---dtype=float16 --kv-cache-dtype=fp8 \ ---distributed-executor-backend=mp --tensor-parallel-size=$TP \ ---max-model-len=$MAX_MODEL_LEN \ ---max-seq-len-to-capture=$MAX_MODEL_LEN \ ---max-num-seqs=$CONC \ ---max-num-batched-tokens=131072 \ ---no-enable-prefix-caching \ ---config config.yaml \ ---async-scheduling \ ---disable-log-requests \ -> $SERVER_LOG 2>&1 & - -set +x -while IFS= read -r line; do - printf '%s\n' "$line" - if [[ "$line" == *"Application startup complete"* ]]; then - break - fi -done < <(tail -F -n0 "$SERVER_LOG") - -set -x -git clone https://github.com/kimbochen/bench_serving.git -python3 bench_serving/benchmark_serving.py \ ---model $MODEL --backend vllm \ ---base-url http://0.0.0.0:$PORT \ ---dataset-name random \ ---random-input-len $ISL --random-output-len $OSL --random-range-ratio $RANDOM_RANGE_RATIO \ ---num-prompts $(( $CONC * 10 )) --max-concurrency $CONC \ ---request-rate inf --ignore-eos \ ---save-result --percentile-metrics 'ttft,tpot,itl,e2el' \ ---result-dir /workspace/ \ ---result-filename $RESULT_FILENAME.json From 4a81cd4c5619deacd89df54476af5b07cddca18d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 18:51:12 -0500 Subject: [PATCH 142/149] cleaning up after rebase --- .github/workflows/1k1k-sweep.yml | 6 +----- .github/workflows/1k8k-sweep.yml | 7 ++++--- .github/workflows/8k1k-sweep.yml | 1 + 3 files changed, 6 insertions(+), 8 deletions(-) diff --git a/.github/workflows/1k1k-sweep.yml b/.github/workflows/1k1k-sweep.yml index e806f4c70..0930f8a9a 100644 --- a/.github/workflows/1k1k-sweep.yml +++ b/.github/workflows/1k1k-sweep.yml @@ -1,8 +1,4 @@ -name: "1K/1K Sweep" - -concurrency: - group: benchmark-lock-1k1k - cancel-in-progress: false +name: "Full Sweep Scheduler - 1k1k" on: workflow_dispatch: diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/1k8k-sweep.yml index f4bb4338e..9dacb5a9f 100644 --- 
a/.github/workflows/1k8k-sweep.yml +++ b/.github/workflows/1k8k-sweep.yml @@ -1,9 +1,9 @@ name: "Full Sweep Scheduler - 1k8k" on: - workflow_dispatch: - schedule: - - cron: '0 23 * * *' + workflow_dispatch: + schedule: + - cron: "0 23 * * *" jobs: get-dsr1-configs: @@ -16,6 +16,7 @@ jobs: - id: get-dsr1-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/8k1k-sweep.yml index fdb6b6112..3a0ae47c3 100644 --- a/.github/workflows/8k1k-sweep.yml +++ b/.github/workflows/8k1k-sweep.yml @@ -16,6 +16,7 @@ jobs: - id: get-dsr1-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT From cac35bc5de887b2097b0f9fb3c0e8fdabb4c2b4b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 31 Oct 2025 18:54:05 -0500 Subject: [PATCH 143/149] changing name of files from XkYk to shceduler --- .../workflows/{1k1k-sweep.yml => full-sweep-1k1k-scheduler.yml} | 0 .../workflows/{1k8k-sweep.yml => full-sweep-1k8k-scheduler.yml} | 0 .../workflows/{8k1k-sweep.yml => full-sweep-8k1k-scheduler.yml} | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename .github/workflows/{1k1k-sweep.yml => full-sweep-1k1k-scheduler.yml} (100%) rename .github/workflows/{1k8k-sweep.yml => full-sweep-1k8k-scheduler.yml} (100%) rename .github/workflows/{8k1k-sweep.yml => full-sweep-8k1k-scheduler.yml} (100%) diff --git a/.github/workflows/1k1k-sweep.yml 
b/.github/workflows/full-sweep-1k1k-scheduler.yml similarity index 100% rename from .github/workflows/1k1k-sweep.yml rename to .github/workflows/full-sweep-1k1k-scheduler.yml diff --git a/.github/workflows/1k8k-sweep.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml similarity index 100% rename from .github/workflows/1k8k-sweep.yml rename to .github/workflows/full-sweep-1k8k-scheduler.yml diff --git a/.github/workflows/8k1k-sweep.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml similarity index 100% rename from .github/workflows/8k1k-sweep.yml rename to .github/workflows/full-sweep-8k1k-scheduler.yml From b60289e52d67d54e656369e5bb44aa3c1ea3f963 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 09:39:03 -0600 Subject: [PATCH 144/149] double check and update master configs --- .github/configs/nvidia-master.yaml | 39 +++++++++++++++--------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 92dfb5bbd..e9af1ce19 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -188,7 +188,7 @@ gptoss-fp4-b200-trt: - { tp: 1, conc-start: 64, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 8192 osl: 1024 search-space: @@ -208,14 +208,14 @@ gptoss-fp4-b200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 search-space: - - { tp: 1, conc-start: 64, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 8 } @@ -225,7 +225,7 @@ gptoss-fp4-b200-vllm: - { 
tp: 1, conc-start: 4, conc-end: 64 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 4 } gptoss-fp4-h100-vllm: image: vllm/vllm-openai:v0.10.2 @@ -252,7 +252,7 @@ gptoss-fp4-h100-vllm: search-space: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-h200-trt: image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev @@ -261,28 +261,29 @@ gptoss-fp4-h200-trt: runner: h200-trt precision: fp4 framework: trt + # For all sequence lengths, EP=TP, DP_ATTENTION=false seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 8 } + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 1024 osl: 8192 search-space: - - { tp: 1, ep: 1, conc-start: 32, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - isl: 8192 osl: 1024 search-space: - - { tp: 1, ep: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - { tp: 1, ep: 1, dp-attn: false, 
conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } gptoss-fp4-h200-vllm: image: vllm/vllm-openai:v0.10.2 @@ -295,14 +296,14 @@ gptoss-fp4-h200-vllm: - isl: 1024 osl: 1024 search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 1, conc-start: 4, conc-end: 4 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } - isl: 1024 osl: 8192 search-space: - - { tp: 1, conc-start: 4, conc-end: 16 } + - { tp: 1, conc-start: 4, conc-end: 4 } - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 64 } From 9fba14ae5a9137dd797cdce18ae68d50a51ace27 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 09:44:57 -0600 Subject: [PATCH 145/149] double check and update master configs pt 2 --- .github/configs/amd-master.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d9558f284..82251c8be 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -155,17 +155,17 @@ gptoss-fp4-mi355x-vllm: osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 1024 osl: 8192 search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 8 } - { tp: 8, conc-start: 4, conc-end: 16 } - isl: 8192 osl: 1024 search-space: - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 16 } - - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 4, conc-start: 4, conc-end: 4 } + - { tp: 8, conc-start: 4, conc-end: 8 } From 
c33187411db4335070ae11c0b19ae8111c56832a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 09:46:16 -0600 Subject: [PATCH 146/149] add pydantic pip install --- .github/workflows/full-sweep-1k8k-scheduler.yml | 1 + .github/workflows/full-sweep-8k1k-scheduler.yml | 1 + 2 files changed, 2 insertions(+) diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 9dacb5a9f..a8ee10d00 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -30,6 +30,7 @@ jobs: - id: get-gptoss-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 3a0ae47c3..cd9cd0531 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -30,6 +30,7 @@ jobs: - id: get-gptoss-configs run: | + pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT From 582e1b1702a91ba274b753779833b7b1838eff11 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 11:19:54 -0600 Subject: [PATCH 147/149] bug fix --- .github/workflows/full-sweep-1k8k-scheduler.yml | 6 +++--- .github/workflows/full-sweep-8k1k-scheduler.yml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml 
b/.github/workflows/full-sweep-1k8k-scheduler.yml index a8ee10d00..4d7e5cc22 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -17,7 +17,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT get-gptoss-configs: @@ -31,7 +31,7 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/get_full_sweep_configs.py --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT benchmark-dsr1: @@ -85,7 +85,7 @@ jobs: # This is a workaround until we can integrate GB200 into master configs. 
benchmark-gb200: uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 1k8k sweep strategy: fail-fast: false matrix: diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index cd9cd0531..a4a492178 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -85,7 +85,7 @@ jobs: # This is a workaround until we can integrate GB200 into master configs. benchmark-gb200: uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep + name: gb200 8k1k sweep strategy: fail-fast: false matrix: From 4b78c4abbff4fcda58247158ab0fb2f236cd7c57 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 11:23:26 -0600 Subject: [PATCH 148/149] update cron trigger to 9:00 PM CDT --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- .github/workflows/full-sweep-1k8k-scheduler.yml | 2 +- .github/workflows/full-sweep-8k1k-scheduler.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 0930f8a9a..6e2128218 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -3,7 +3,7 @@ name: "Full Sweep Scheduler - 1k1k" on: workflow_dispatch: schedule: - - cron: "0 23 * * *" + - cron: "0 3 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 4d7e5cc22..b8437969e 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -3,7 +3,7 @@ name: "Full Sweep Scheduler - 1k8k" on: workflow_dispatch: schedule: - - cron: "0 23 * * *" + - cron: "0 3 * * *" jobs: get-dsr1-configs: diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 
a4a492178..bc3cd07dc 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -3,7 +3,7 @@ name: "Full Sweep Scheduler - 8k1k" on: workflow_dispatch: schedule: - - cron: "0 23 * * *" + - cron: "0 3 * * *" jobs: get-dsr1-configs: From 7c4c931a0660a760c9f2a9020737285eae1b4907 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 3 Nov 2025 12:02:08 -0600 Subject: [PATCH 149/149] runner name bug in process result python script --- .github/workflows/benchmark-multinode-tmpl.yml | 2 ++ .github/workflows/benchmark-tmpl.yml | 2 ++ utils/process_result.py | 2 +- 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index bfbd5a1cf..4b079f578 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,6 +86,8 @@ jobs: fi - name: Process results + env: + RUNNER_TYPE: ${{ inputs.runner }} run: | # Process each result file for result_file in ${RESULT_FILENAME}_*.json; do diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 293e3ac49..8d041bc73 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -138,6 +138,8 @@ jobs: fi - name: Process result + env: + RUNNER_TYPE: ${{ inputs.runner }} run: | python3 utils/process_result.py - name: Upload result diff --git a/utils/process_result.py b/utils/process_result.py index d59a61790..94ca30f24 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,7 +4,7 @@ from pathlib import Path -hw = os.environ.get('RUNNER_NAME') +hw = os.environ.get('RUNNER_TYPE') tp_size = int(os.environ.get('TP')) ep_size = int(os.environ.get('EP_SIZE')) dp_attention = os.environ.get('DP_ATTENTION')