From 35bc387fffe898ea47d190c2f3ebd8c06d568d96 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Thu, 16 Oct 2025 19:31:22 -0500
Subject: [PATCH 01/17] adding static config defining parallelism search spaces
 for all runs

---
 .github/configs/search-space.yml        | 382 ++++++++++++++++++++++++
 .github/workflows/70b-tmpl.yml          |  16 +-
 .github/workflows/benchmark-tmpl.yml    |  36 ++-
 .github/workflows/dsr1-tmpl.yml         |  13 -
 .github/workflows/gptoss-tmpl.yml       |   8 -
 benchmarks/dsr1_fp4_b200_trt_slurm.sh   | 122 ++++----
 benchmarks/dsr1_fp8_b200_trt_slurm.sh   |  46 +--
 benchmarks/dsr1_fp8_h200_trt_slurm.sh   |  46 +--
 benchmarks/gptoss_fp4_b200_trt_slurm.sh |  50 ++--
 utils/flatten_matrix.py                 |  73 +++++
 10 files changed, 629 insertions(+), 163 deletions(-)
 create mode 100644 .github/configs/search-space.yml
 create mode 100644 utils/flatten_matrix.py

diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml
new file mode 100644
index 000000000..7c1564909
--- /dev/null
+++ b/.github/configs/search-space.yml
@@ -0,0 +1,382 @@
+gptoss:
+  fp4:
+    h100:
+      1k1k:
+      - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: [ 8 ], conc: { start: 4, end: 32 } }
+    h200:
+      1k1k:
+      - { tp: 1, conc: { start: 4, end: 64 } }
+      - { tp: 2, conc: { start: 4, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 1, conc: { start: 4, end: 16 } }
+      - { tp: 2, conc: { start: 4, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+    h200-trt:
+      1k1k:
+      - { tp: 1, conc: { start: 4, end: 64 } }
+      - { tp: 2, conc: { start: 4, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 32 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+      1k8k:
+      - { tp: 1, conc: { start: 32, end: 64 } }
+      - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 1, conc: { start: 4, end: 64 } }
+      - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } }
+    b200:
+      1k1k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+      1k8k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+      8k1k:
+      - { tp: 1, conc: { start: 4, end: 64 } }
+      - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } }
+    b200-trt:
+      1k1k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: 2, conc: { start: 4, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 8 } }
+      - { tp: 4, ep: 4, conc: { start: 16, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+      1k8k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: 2, conc: { start: 4, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 8 } }
+      - { tp: 4, ep: 4, conc: { start: 16, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+      8k1k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+    mi300x:
+      1k1k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+      1k8k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+      8k1k:
+      - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+    mi325x:
+      1k1k:
+      - { tp: [ 1, 2, 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: [ 1, 4 ], conc: { start: 64, end: 64 } }
+      - { tp: [ 2, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 1, conc: { start: 4, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 8 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+    mi355x:
+      1k1k:
+      - { tp: 1, conc: { start: 4, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 16 } }
+      1k8k:
+      - { tp: [ 1, 4, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 1, conc: { start: 4, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 16 } }
+
+llama:
+  fp4:
+    b200:
+      1k1k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: 2, conc: { start: 32, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+      1k8k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: 2, conc: { start: 32, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+      8k1k:
+      - { tp: 1, conc: { start: 16, end: 64 } }
+      - { tp: 2, conc: { start: 16, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 32 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+    b200-trt:
+      1k1k:
+      - { tp: 1, conc: { start: 128, end: 128 } }
+      - { tp: 2, conc: { start: 64, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+      1k8k:
+      - { tp: 1, conc: { start: 128, end: 128 } }
+      - { tp: 2, conc: { start: 64, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+      8k1k:
+      - { tp: 1, conc: { start: 32, end: 128 } }
+      - { tp: 2, conc: { start: 16, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+    mi355x:
+      1k1k:
+      - { tp: 1, conc: { start: 32, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+      1k8k:
+      - { tp: 1, conc: { start: 32, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+      8k1k:
+      - { tp: 1, conc: { start: 32, end: 64 } }
+      - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+  fp8:
+    h100:
+      1k1k:
+      - { tp: 2, conc: { start: 64, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 2, conc: { start: 64, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 2, conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+    h200:
+      1k1k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: 2, conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: [ 1, 2 ], conc: { start: 16, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+    h200-trt:
+      1k1k:
+      - { tp: 1, conc: { start: 128, end: 128 } }
+      - { tp: 2, conc: { start: 64, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+      1k8k:
+      - { tp: 1, conc: { start: 128, end: 128 } }
+      - { tp: 2, conc: { start: 64, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+      8k1k:
+      - { tp: [ 1, 2 ], conc: { start: 16, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+    b200:
+      1k1k:
+      - { tp: 1, conc: { start: 64, end: 64 } }
+      - { tp: 2, conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } }
+      - { tp: 4, conc: { start: 16, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 1, conc: { start: 32, end: 64 } }
+      - { tp: 2, conc: { start: 16, end: 64 } }
+      - { tp: 4, conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+    b200-trt:
+      1k1k:
+      - { tp: 1, conc: { start: 128, end: 128 } }
+      - { tp: 2, conc: { start: 64, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+      1k8k:
+      - { tp: 1, conc: { start: 128, end: 128 } }
+      - { tp: 2, conc: { start: 64, end: 128 } }
+      - { tp: 4, conc: { start: 16, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+      8k1k:
+      - { tp: 1, conc: { start: 32, end: 128 } }
+      - { tp: 2, conc: { start: 16, end: 128 } }
+      - { tp: 4, conc: { start: 4, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+    mi300x:
+      1k1k:
+      - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+    mi325x:
+      1k1k:
+      - { tp: [ 1, 2, 4 ], conc: { start: 32, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } }
+      - { tp: 4, conc: { start: 64, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 1, conc: { start: 16, end: 64 } }
+      - { tp: 2, conc: { start: 4, end: 32 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+    mi355x:
+      1k1k:
+      - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } }
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+
+dsr1:
+  fp4:
+    b200:
+      1k1k:
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } }
+      1k8k:
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } }
+      8k1k:
+      - { tp: 4, conc: { start: 4, end: 128 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+    b200-trt:
+      1k1k:
+      - { tp: 4, conc: { start: 4, end: 32 } }
+      - { tp: 4, ep: 4, conc: { start: 64, end: 128 } }
+      - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      - { tp: 8, conc: { start: 4, end: 8 } }
+      - { tp: 8, ep: 8, conc: { start: 16, end: 128 } }
+      - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      1k8k:
+      - { tp: 4, conc: { start: 4, end: 32 } }
+      - { tp: 4, ep: 4, conc: { start: 64, end: 128 } }
+      - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      - { tp: 8, conc: { start: 4, end: 16 } }
+      - { tp: 8, ep: 8, conc: { start: 32, end: 128 } }
+      - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      8k1k:
+      - { tp: 4, conc: { start: 4, end: 32 } }
+      - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } }
+      - { tp: 8, conc: { start: 4, end: 32 } }
+      - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } }
+    gb200:
+      1k1k:
+      - { tp: 12, conc: { start: 4300, end: 4300 } }
+      - { tp: 24, conc: { start: 4300, end: 4300 } }
+      - { tp: 24, conc: { start: 2048, end: 2048 } }
+      - { tp: 20, conc: { start: 1075, end: 1075 } }
+      - { tp: 36, conc: { start: 1075, end: 1075 } }
+      - { tp: 36, conc: { start: 564, end: 564 } }
+      - { tp: 36, conc: { start: 4, end: 256 } }
+      8k1k:
+      - { tp: 28, conc: { start: 2150, end: 2150 } }
+      - { tp: 48, conc: { start: 2150, end: 2150 } }
+      - { tp: 40, conc: { start: 1075, end: 1075 } }
+      - { tp: 48, conc: { start: 538, end: 538 } }
+      - { tp: 48, conc: { start: 256, end: 256 } }
+      - { tp: 28, conc: { start: 102, end: 102 } }
+      - { tp: 28, conc: { start: 3, end: 48 } }
+    gb200-mtp:
+      1k1k:
+      - { tp: 12, conc: { start: 2252, end: 2252 } }
+      - { tp: 24, conc: { start: 2150, end: 2150 } }
+      - { tp: 20, conc: { start: 1075, end: 1075 } }
+      - { tp: 20, conc: { start: 512, end: 512 } }
+      - { tp: 36, conc: { start: 512, end: 512 } }
+      - { tp: 36, conc: { start: 144, end: 144 } }
+      - { tp: 36, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 32, conc: { start: 2150, end: 2150 } }
+      - { tp: 48, conc: { start: 1075, end: 1075 } }
+      - { tp: 64, conc: { start: 538, end: 538 } }
+      - { tp: 52, conc: { start: 269, end: 269 } }
+      - { tp: 52, conc: { start: 128, end: 128 } }
+      - { tp: 28, conc: { start: 54, end: 54 } }
+      - { tp: 28, conc: { start: 3, end: 24 } }
+    mi355x:
+      1k1k:
+      - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+  fp8:
+    h200:
+      1k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+    h200-trt:
+      1k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+    b200:
+      1k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+    b200-trt:
+      1k1k:
+      - { tp: 8, ep: 8, conc: { start: 4, end: 32 } }
+      1k8k:
+      - { tp: 8, ep: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, ep: 8, conc: { start: 4, end: 32 } }
+      - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } }
+    gb200:
+      1k1k:
+      - { tp: 72, conc: { start: 8192, end: 8192 } }
+      - { tp: 72, conc: { start: 6144, end: 6144 } }
+      - { tp: 72, conc: { start: 5632, end: 5632 } }
+      - { tp: 72, conc: { start: 5376, end: 5376 } }
+      - { tp: 72, conc: { start: 5120, end: 5120 } }
+      - { tp: 72, conc: { start: 4992, end: 4992 } }
+      - { tp: 72, conc: { start: 4864, end: 4864 } }
+      - { tp: 72, conc: { start: 4608, end: 4608 } }
+      - { tp: 72, conc: { start: 1024, end: 4096 } }
+      8k1k:
+      - { tp: 72, conc: { start: 128, end: 4096 } }
+      - { tp: 72, conc: { start: 576, end: 576 } }
+      - { tp: 72, conc: { start: 448, end: 448 } }
+      - { tp: 72, conc: { start: 384, end: 384 } }
+    mi300x:
+      1k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+    mi325x:
+      1k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+    mi355x:
+      1k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      1k8k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
+      8k1k:
+      - { tp: 8, conc: { start: 4, end: 64 } }
diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml
index 3d1dd5051..cb9776b64 100644
--- a/.github/workflows/70b-tmpl.yml
+++ b/.github/workflows/70b-tmpl.yml
@@ -54,7 +54,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[2, 4, 8]'
 
   bmk-h200-fp8:
     if: ${{ inputs.use_h200 }}
@@ -71,7 +70,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]'
 
   bmk-h200-trt-fp8:
     if: ${{ inputs.use_h200 }}
@@ -88,8 +86,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]'  
-      conc-list: '[4, 8, 16, 32, 64, 128]'  # H200 can achieve TPS/User >= 30 with larger concurrency till 128
 
   bmk-b200-fp8:
     if: ${{ inputs.use_b200 }}
@@ -106,7 +102,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has
 
   bmk-b200-trt-fp8:
     if: ${{ inputs.use_b200 }}
@@ -123,8 +118,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has
-      conc-list: '[4, 8, 16, 32, 64, 128]'  # B200 can achieve TPS/User >= 30 with larger concurrency till 256
 
   bmk-mi300x-fp8:
     if: ${{ inputs.use_mi300x }}
@@ -141,7 +134,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]'
 
   bmk-mi325x-fp8:
     if: ${{ inputs.use_mi325x }}
@@ -158,7 +150,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]'
 
   bmk-mi355x-fp8:
     if: ${{ inputs.use_mi355x }}
@@ -175,7 +166,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]'
 
   bmk-b200-fp4:
     if: ${{ inputs.use_b200 }}
@@ -192,7 +182,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]'  # fix: add TP=2,4 to B200, just as mi355 has
 
   bmk-b200-trt-fp4:
     if: ${{ inputs.use_b200 }}
@@ -209,8 +198,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has
-      conc-list: '[4, 8, 16, 32, 64, 128]'  # B200 can achieve TPS/User >= 30 with larger concurrency till 128
 
   bmk-mi355x-fp4:
     if: ${{ inputs.use_mi355x }}
@@ -226,5 +213,4 @@ jobs:
       isl: ${{ inputs.isl }}
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[1, 2, 4, 8]'
+      random-range-ratio: ${{ inputs.random-range-ratio }}
\ No newline at end of file
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 313087946..03d007f5c 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -32,17 +32,12 @@ on:
       random-range-ratio:
         required: true
         type: string
-      tp-list:
-        required: true
-        type: string
-      conc-list:
-        type: string
-        default: '[4, 8, 16, 32, 64]'
 
 env:
   HF_TOKEN: ${{ secrets.HF_TOKEN }}
   HF_HUB_CACHE: '/mnt/hf_hub_cache/'
   EXP_NAME: ${{ inputs.exp-name }}
+  RUNNER: ${{ inputs.runner }}
   MODEL: ${{ inputs.model }}
   ISL: ${{ inputs.isl }}
   OSL: ${{ inputs.osl }}
@@ -53,22 +48,41 @@ env:
   PRECISION: ${{ inputs.precision }}
 
 jobs:
+  flatten-search-space-matrix:
+    runs-on: ubuntu-latest
+    outputs:
+      flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }}
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          token: ${{ secrets.REPO_PAT }}
+          fetch-depth: 0
+          
+      - id: flatten
+        run: python3 ${GITHUB_WORKSPACE}/utils/flatten_matrix.py
+
   benchmark:
     runs-on: ${{ inputs.runner }}
+    needs: flatten-search-space-matrix
     timeout-minutes: 180
 
     strategy:
       fail-fast: false
       matrix:
-        tp: ${{ fromJson(inputs.tp-list) }}
-        conc: ${{ fromJson(inputs.conc-list) }}
-    name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.tp }} conc${{ matrix.conc }}'
+        config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }}
+    name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} conc${{ matrix.config.conc }}'
 
     env:
-      TP: ${{ matrix.tp }}
-      CONC: ${{ matrix.conc }}
+      TP: ${{ matrix.config.tp }}
+      CONC: ${{ matrix.config.conc }}
+      EP_SIZE: ${{ matrix.config.ep || 1 }}
+      DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }}
+      MOE_BACKEND: ${{ matrix.config.moe_backend || 'NONE' }}
 
     steps:
+      - name: debug
+        run: echo '${{ toJson(matrix.config) }}'
+
       - name: Resource cleanup
         run: |
           if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then
diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
index 3a48710f2..64dfe4ec4 100644
--- a/.github/workflows/dsr1-tmpl.yml
+++ b/.github/workflows/dsr1-tmpl.yml
@@ -55,7 +55,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[8]'
 
   bmk-h200-trt-fp8:
     if: ${{ inputs.use_h200 }}
@@ -72,7 +71,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[8]'
 
   bmk-b200-fp8:
     if: ${{ inputs.use_b200 }}
@@ -89,7 +87,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[8]'
 
   bmk-b200-trt-fp8:
     if: ${{ inputs.use_b200 }}
@@ -106,7 +103,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[8]'
 
   bmk-mi300x-fp8:
     if: ${{ inputs.use_mi300x }}
@@ -123,7 +119,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[8]'
 
   bmk-mi325x-fp8:
     if: ${{ inputs.use_mi325x }}
@@ -140,7 +135,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[8]'
 
   bmk-mi355x-fp8:
     if: ${{ inputs.use_mi355x }}
@@ -157,7 +151,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[8]'
 
   bmk-b200-fp4:
     if: ${{ inputs.use_b200 }}
@@ -174,8 +167,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[4,8]'
-      conc-list: '[4, 8, 16, 32, 64, 128]'  # Custom concurrency values for this job
 
   bmk-b200-trt-fp4:
     if: ${{ inputs.use_b200 }}
@@ -192,8 +183,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      tp-list: '[4, 8]'
-      conc-list: '[4, 8, 16, 32, 64, 128, 256]'  # DPA4EP4 is already 30 tok/s/user and DPA8EP8 is already 35tok/s/user. 512 conc would be too much so we skipping it
 
   bmk-mi355x-fp4:
     if: ${{ inputs.use_mi355x }}
@@ -210,8 +199,6 @@ jobs:
       osl: ${{ inputs.osl }}
       max-model-len: ${{ inputs.max-model-len }}
       random-range-ratio: ${{ inputs.random-range-ratio }}
-      # These tensor parallelism settings are not necessary as they cannot fall on the Pareto frontier with this particular container - we remove them to save CI time.
-      tp-list: ${{ inputs.isl == 1024 && inputs.osl == 1024 && '[4, 8]' || '[8]' }}
 
   bmk-gb200-fp4-multinode-mtp-off:
     if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }}
diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml
index 0c505de07..03a205ff5 100644
--- a/.github/workflows/gptoss-tmpl.yml
+++ b/.github/workflows/gptoss-tmpl.yml
@@ -52,7 +52,6 @@ jobs:
       runner: h100
       image: 'vllm/vllm-openai:v0.10.2'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[2, 4, 8]'
       framework: 'vllm'
       precision: 'fp4'
 
@@ -69,7 +68,6 @@ jobs:
       runner: h200
       image: 'vllm/vllm-openai:v0.10.2'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[1, 2, 4, 8]'
       framework: 'vllm'
       precision: 'fp4'
 
@@ -86,7 +84,6 @@ jobs:
       runner: b200
       image: 'vllm/vllm-openai:v0.10.2'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[1, 2, 4, 8]'
       framework: 'vllm'
       precision: 'fp4'
 
@@ -103,7 +100,6 @@ jobs:
       runner: b200-trt
       image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[1, 2, 4, 8]'
       framework: 'trt'
       precision: 'fp4'
 
@@ -120,7 +116,6 @@ jobs:
       runner: h200-trt
       image: 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[1, 2, 4, 8]'
       framework: 'trt'
       precision: 'fp4'
 
@@ -137,7 +132,6 @@ jobs:
       runner: mi300x
       image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[1, 2, 4, 8]'
       framework: 'vllm'
       precision: 'fp4'
 
@@ -154,7 +148,6 @@ jobs:
       runner: mi325x
       image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[1, 2, 4, 8]'
       framework: 'vllm'
       precision: 'fp4'
 
@@ -171,6 +164,5 @@ jobs:
       runner: mi355x
       image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
       model: 'openai/gpt-oss-120b'
-      tp-list: '[1, 4, 8]'
       framework: 'vllm'
       precision: 'fp4'
diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index ffdae541c..dbffe8494 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -13,69 +13,77 @@
 # CONC
 # RESULT_FILENAME
 # PORT_OFFSET
+# EP_SIZE
+# DP_ATTENTION
+# MOE_BACKEND
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+# The workflow exports MOE_BACKEND=NONE when the config names no backend; unset/empty or NONE falls back to TRTLLM.
+if [[ "${MOE_BACKEND:-NONE}" == "NONE" ]]; then
+    MOE_BACKEND="TRTLLM"
+fi
 
-hf download $MODEL
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
 
-# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-EP_SIZE="1"
-MOE_BACKEND="TRTLLM"
-DP_ATTENTION=false
-
-if [[ "$TP" == "4" ]]; then
-    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -gt 32 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 256 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-        fi
-    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-        if [[ $CONC -gt 32 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 256 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-        fi
-    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -gt 32 ]]; then
-            EP_SIZE="$TP"
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-        fi
-    fi
-elif [[ "$TP" == "8" ]]; then
-    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -gt 8 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 256 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-        fi
-    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-        if [[ $CONC -gt 16 ]]; then
-            EP_SIZE="$TP"
-        fi
-        if [[ $CONC -ge 256 ]]; then
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-        fi
-    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-        if [[ $CONC -gt 32 ]]; then
-            EP_SIZE="$TP"
-            DP_ATTENTION=true
-            MOE_BACKEND="CUTLASS"
-        fi
-    fi
-fi
+hf download $MODEL
 
-echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
+# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
+# EP_SIZE="1"
+# MOE_BACKEND="TRTLLM"
+# DP_ATTENTION=false
+
+# if [[ "$TP" == "4" ]]; then
+#     if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+#         if [[ $CONC -gt 32 ]]; then
+#             EP_SIZE="$TP"
+#         fi
+#         if [[ $CONC -ge 256 ]]; then
+#             DP_ATTENTION=true
+#             MOE_BACKEND="CUTLASS"
+#         fi
+#     elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+#         if [[ $CONC -gt 32 ]]; then
+#             EP_SIZE="$TP"
+#         fi
+#         if [[ $CONC -ge 256 ]]; then
+#             DP_ATTENTION=true
+#             MOE_BACKEND="CUTLASS"
+#         fi
+#     elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+#         if [[ $CONC -gt 32 ]]; then
+#             EP_SIZE="$TP"
+#             DP_ATTENTION=true
+#             MOE_BACKEND="CUTLASS"
+#         fi
+#     fi
+# elif [[ "$TP" == "8" ]]; then
+#     if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+#         if [[ $CONC -gt 8 ]]; then
+#             EP_SIZE="$TP"
+#         fi
+#         if [[ $CONC -ge 256 ]]; then
+#             DP_ATTENTION=true
+#             MOE_BACKEND="CUTLASS"
+#         fi
+#     elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+#         if [[ $CONC -gt 16 ]]; then
+#             EP_SIZE="$TP"
+#         fi
+#         if [[ $CONC -ge 256 ]]; then
+#             DP_ATTENTION=true
+#             MOE_BACKEND="CUTLASS"
+#         fi
+#     elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+#         if [[ $CONC -gt 32 ]]; then
+#             EP_SIZE="$TP"
+#             DP_ATTENTION=true
+#             MOE_BACKEND="CUTLASS"
+#         fi
+#     fi
+# fi
+
+# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index e909b954a..67dbb8396 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -13,31 +13,39 @@
 # CONC
 # RESULT_FILENAME
 # PORT_OFFSET
+# EP_SIZE
+# DP_ATTENTION
+# MOE_BACKEND
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+# Default backend is DEEPGEMM
+if [[ "MOE_BACKEND" == "NONE" ]]; then
+    MOE_BACKEND="DEEPGEMM"
+fi
 
-hf download $MODEL
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
 
-# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-EP_SIZE="$TP"
-MOE_BACKEND="DEEPGEMM"
-DP_ATTENTION=false
+hf download $MODEL
 
-if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-    if [[ $CONC -gt 32 ]]; then
-        DP_ATTENTION=true
-    fi
-elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-    if [[ $CONC -gt 64 ]]; then
-        DP_ATTENTION=true
-    fi
-elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-    if [[ $CONC -gt 64 ]]; then
-        DP_ATTENTION=true
-    fi
-fi
+# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
+# EP_SIZE="$TP"
+# MOE_BACKEND="DEEPGEMM"
+# DP_ATTENTION=false
+
+# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+#     if [[ $CONC -gt 32 ]]; then
+#         DP_ATTENTION=true
+#     fi
+# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+#     if [[ $CONC -gt 64 ]]; then
+#         DP_ATTENTION=true
+#     fi
+# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+#     if [[ $CONC -gt 64 ]]; then
+#         DP_ATTENTION=true
+#     fi
+# fi
 
 echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
 
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index 20101e466..a71068867 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -13,31 +13,39 @@
 # CONC
 # RESULT_FILENAME
 # PORT_OFFSET
+# EP_SIZE
+# DP_ATTENTION
+# MOE_BACKEND
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+# Default backend is CUTLASS
+if [[ "MOE_BACKEND" == "NONE" ]]; then
+    MOE_BACKEND="CUTLASS"
+fi
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
 
 hf download $MODEL
 
 # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-EP_SIZE="$TP"
-MOE_BACKEND="CUTLASS"
-DP_ATTENTION=false
-
-if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-    if [[ $CONC -gt 64 ]]; then
-        DP_ATTENTION=true
-    fi
-elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-    if [[ $CONC -gt 64 ]]; then
-        DP_ATTENTION=true
-    fi
-elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-    if [[ $CONC -gt 32 ]]; then
-        DP_ATTENTION=true
-    fi
-fi
+# EP_SIZE="$TP"
+# MOE_BACKEND="CUTLASS"
+# DP_ATTENTION=false
+
+# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+#     if [[ $CONC -gt 64 ]]; then
+#         DP_ATTENTION=true
+#     fi
+# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+#     if [[ $CONC -gt 64 ]]; then
+#         DP_ATTENTION=true
+#     fi
+# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+#     if [[ $CONC -gt 32 ]]; then
+#         DP_ATTENTION=true
+#     fi
+# fi
 
 echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
 
@@ -49,7 +57,7 @@ cat > $EXTRA_CONFIG_FILE << EOF
 cuda_graph_config:
     enable_padding: true
     max_batch_size: 128
-enable_attention_dp: $DP_ATTENTION
+enable_attention_dp: $DP_ATTENTION
 print_iter_log: true
 kv_cache_config:
     dtype: fp8
diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
index 96745306a..9ffa26659 100644
--- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
@@ -13,48 +13,56 @@
 # CONC
 # RESULT_FILENAME
 # PORT_OFFSET
+# EP_SIZE
+# DP_ATTENTION
+# MOE_BACKEND
 
 # GPTOSS TRTLLM Deployment Guide:
 # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+# Default backend is TRTLLM
+if [[ "MOE_BACKEND" == "NONE" ]]; then
+    MOE_BACKEND="TRTLLM"
+fi
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
 
 hf download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
 # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-EP_SIZE="1"
-MOE_BACKEND="TRTLLM"
-DP_ATTENTION=false
+# EP_SIZE="1"
+# MOE_BACKEND="TRTLLM"
+# DP_ATTENTION=false
 
 # Lower concurrencies: Concurrency < 256
 # MoE backend=TRTLLM
 # Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k)
-TEP_REQUIRED=false
-if [[ "$TP" == "4" || "$TP" == "8" ]]; then 
-    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-        TEP_REQUIRED=true
-    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-        TEP_REQUIRED=true
-    fi
-fi
-if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then
-    EP_SIZE="$TP"
-fi
+# TEP_REQUIRED=false
+# if [[ "$TP" == "4" || "$TP" == "8" ]]; then 
+#     if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+#         TEP_REQUIRED=true
+#     elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+#         TEP_REQUIRED=true
+#     fi
+# fi
+# if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then
+#     EP_SIZE="$TP"
+# fi
 
 # Higher concurrencies: Concurrency >= 256
 #   MoE Backend = CUTLASS
 #   Use DP attention with expert parallel MoE
-if [[ $CONC -ge 256 ]]; then
-    EP_SIZE="$TP"
-    DP_ATTENTION=true
-    MOE_BACKEND="CUTLASS"
-fi
+# if [[ $CONC -ge 256 ]]; then
+#     EP_SIZE="$TP"
+#     DP_ATTENTION=true
+#     MOE_BACKEND="CUTLASS"
+# fi
 
-echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
+# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
 
 EXTRA_CONFIG_FILE="gptoss-fp4.yml"
 export TRTLLM_ENABLE_PDL=1
diff --git a/utils/flatten_matrix.py b/utils/flatten_matrix.py
new file mode 100644
index 000000000..a46bce549
--- /dev/null
+++ b/utils/flatten_matrix.py
@@ -0,0 +1,73 @@
+import yaml
+import os
+import json
+
+with open('.github/configs/search-space.yml', 'r') as f:
+    search_space = yaml.safe_load(f)
+
+seq_len_map = {
+    '1024': '1k',
+    '8192': '8k',
+}
+
+runner = os.environ['RUNNER']
+model = os.environ['MODEL']
+isl = os.environ['ISL']
+osl = os.environ['OSL']
+precision = os.environ['PRECISION']
+
+# In the workflows, not all model references are the same so do a sort
+# of partial matching to map to the search space keys
+model_map = {
+    'gpt-oss': 'gptoss',
+    'llama-3.3-70b-instruct': 'llama',
+    'deepseek-r1-0528': 'dsr1',
+}
+
+model_key = None
+model_lower = model.lower()
+for key, value in model_map.items():
+    if key.lower() in model_lower:
+        model_key = value
+        break
+
+if model_key is None:
+    raise ValueError(f"Model '{model}' is not recognized.")
+
+seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}"
+
+if isinstance(search_space, list):
+    entries = search_space
+else:
+    entries = search_space.get(model_key, {}).get(
+        precision, {}).get(runner, {}).get(seq_len, [])
+
+flattened_search_space = []
+for entry in entries:
+    tp_list = entry['tp'] if isinstance(
+        entry.get('tp'), list) else [entry.get('tp')]
+
+    conc_config = entry.get('conc')
+    if isinstance(conc_config, dict):
+        start = conc_config['start']
+        end = conc_config['end']
+        step_factor = conc_config.get('step', 2)
+        conc_list = []
+        current = start
+        while current <= end:
+            conc_list.append(current)
+            current *= step_factor
+    elif isinstance(conc_config, list):
+        conc_list = conc_config
+    else:
+        conc_list = [conc_config]
+
+    for tp_value in tp_list:
+        for conc_value in conc_list:
+            new_entry = entry.copy()
+            new_entry['tp'] = tp_value
+            new_entry['conc'] = conc_value
+            flattened_search_space.append(new_entry)
+
+with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
+    f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n")

From 683aba7eac106fb18308d7fbbf88abeda3673b9c Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Thu, 16 Oct 2025 23:04:12 -0500
Subject: [PATCH 02/17] getting rid of moe backend logic; adding comments

---
 .github/configs/search-space.yml        | 60 ++++++++++++----
 .github/workflows/benchmark-tmpl.yml    |  1 -
 benchmarks/dsr1_fp4_b200_trt_slurm.sh   | 92 ++++++++++---------------
 benchmarks/dsr1_fp8_b200_trt_slurm.sh   | 27 +-------
 benchmarks/dsr1_fp8_h200_trt_slurm.sh   | 29 +-------
 benchmarks/gptoss_fp4_b200_trt_slurm.sh | 43 +++---------
 6 files changed, 95 insertions(+), 157 deletions(-)

diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml
index 7c1564909..34ed9f268 100644
--- a/.github/configs/search-space.yml
+++ b/.github/configs/search-space.yml
@@ -46,17 +46,25 @@ gptoss:
       - { tp: 1, conc: { start: 4, end: 64 } }
       - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } }
     b200-trt:
+      # NOTE: Regardless of TP, if CONC >= 256, DP_ATTENTION should be set
+      # to true and EP should be set to TP.
+      # For lower concurrencies (CONC < 256), use TP attention; switch to
+      # MoE expert parallel for concurrency >= 16 (1k1k and 1k8k).
       1k1k:
       - { tp: 1, conc: { start: 64, end: 64 } }
       - { tp: 2, conc: { start: 4, end: 64 } }
+      # EP=4 iff TP=4 and CONC >= 16
       - { tp: 4, conc: { start: 4, end: 8 } }
       - { tp: 4, ep: 4, conc: { start: 16, end: 64 } }
+      # EP=8 iff TP=8 and CONC >= 16
       - { tp: 8, conc: { start: 4, end: 8 } }
       1k8k:
       - { tp: 1, conc: { start: 64, end: 64 } }
       - { tp: 2, conc: { start: 4, end: 64 } }
+      # EP=4 iff TP=4 and CONC >= 16
       - { tp: 4, conc: { start: 4, end: 8 } }
       - { tp: 4, ep: 4, conc: { start: 16, end: 64 } }
+      # EP=8 iff TP=8 and CONC >= 16
       - { tp: 8, conc: { start: 4, end: 8 } }
       8k1k:
       - { tp: 1, conc: { start: 64, end: 64 } }
@@ -252,25 +260,45 @@ dsr1:
       - { tp: 4, conc: { start: 4, end: 128 } }
       - { tp: 8, conc: { start: 4, end: 16 } }
     b200-trt:
+      # Determine DP_ATTENTION, and EP_SIZE based on ISL, OSL, CONC
+      # For ISL/OSL = 1k/1k
       1k1k:
+      # If TP=4,
+      #   EP_SIZE=4 iff CONC > 32
+      #   DP_ATTENTION=true iff CONC >= 256
       - { tp: 4, conc: { start: 4, end: 32 } }
       - { tp: 4, ep: 4, conc: { start: 64, end: 128 } }
-      - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } }
+      # If TP=8,
+      #   EP_SIZE=8 iff CONC > 8
+      #   DP_ATTENTION=true iff CONC >= 256
       - { tp: 8, conc: { start: 4, end: 8 } }
       - { tp: 8, ep: 8, conc: { start: 16, end: 128 } }
-      - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } }
+      # For ISL/OSL = 1k/8k
       1k8k:
+      # If TP=4,
+      #   EP_SIZE=4 iff CONC > 32
+      #   DP_ATTENTION=true iff CONC >= 256
       - { tp: 4, conc: { start: 4, end: 32 } }
       - { tp: 4, ep: 4, conc: { start: 64, end: 128 } }
-      - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } }
+      # If TP=8,
+      #   EP_SIZE=8 iff CONC > 16
+      #   DP_ATTENTION=true iff CONC >= 256
       - { tp: 8, conc: { start: 4, end: 16 } }
       - { tp: 8, ep: 8, conc: { start: 32, end: 128 } }
-      - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } }
+      - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } }
+      # For ISL/OSL = 8k/1k
       8k1k:
+      # If TP=4,
+      #   EP_SIZE=4 and DP_ATTENTION=true iff CONC > 32
       - { tp: 4, conc: { start: 4, end: 32 } }
-      - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } }
+      - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 64, end: 256 } }
+      # If TP=8,
+      #   EP_SIZE=8 and DP_ATTENTION=true iff CONC > 32
       - { tp: 8, conc: { start: 4, end: 32 } }
-      - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } }
+      - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 256 } }
     gb200:
       1k1k:
       - { tp: 12, conc: { start: 4300, end: 4300 } }
@@ -321,13 +349,17 @@ dsr1:
       8k1k:
       - { tp: 8, conc: { start: 4, end: 64 } }
     h200-trt:
+      # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC
+      # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 64
       1k1k:
-      - { tp: 8, conc: { start: 4, end: 64 } }
-      - { tp: 8, conc: { start: 4, end: 64 } }
+      - { tp: 8, ep: 8, conc: { start: 4, end: 64 } }
+      # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64
       1k8k:
-      - { tp: 8, conc: { start: 4, end: 64 } }
+      - { tp: 8, ep: 8, conc: { start: 4, end: 64 } }
+      # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 32
       8k1k:
-      - { tp: 8, conc: { start: 4, end: 64 } }
+      - { tp: 8, ep: 8, conc: { start: 4, end: 32 } }
+      - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } }
     b200:
       1k1k:
       - { tp: 8, conc: { start: 4, end: 64 } }
@@ -336,13 +368,17 @@ dsr1:
       8k1k:
       - { tp: 8, conc: { start: 4, end: 64 } }
     b200-trt:
+      # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC
+      # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 32
       1k1k:
       - { tp: 8, ep: 8, conc: { start: 4, end: 32 } }
+      - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } }
+      # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64
       1k8k:
       - { tp: 8, ep: 8, conc: { start: 4, end: 64 } }
+      # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 64
       8k1k:
-      - { tp: 8, ep: 8, conc: { start: 4, end: 32 } }
-      - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } }
+      - { tp: 8, ep: 8, conc: { start: 4, end: 64 } }
     gb200:
       1k1k:
       - { tp: 72, conc: { start: 8192, end: 8192 } }
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 03d007f5c..a027bda8b 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -77,7 +77,6 @@ jobs:
       CONC: ${{ matrix.config.conc }}
       EP_SIZE: ${{ matrix.config.ep || 1 }}
       DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }}
-      MOE_BACKEND: ${{ matrix.config.moe_backend || 'NONE' }}
 
     steps:
       - name: debug
diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index dbffe8494..215de86e5 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -15,7 +15,6 @@
 # PORT_OFFSET
 # EP_SIZE
 # DP_ATTENTION
-# MOE_BACKEND
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
@@ -28,62 +27,41 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 
 hf download $MODEL
 
-# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-# EP_SIZE="1"
-# MOE_BACKEND="TRTLLM"
-# DP_ATTENTION=false
-
-# if [[ "$TP" == "4" ]]; then
-#     if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-#         if [[ $CONC -gt 32 ]]; then
-#             EP_SIZE="$TP"
-#         fi
-#         if [[ $CONC -ge 256 ]]; then
-#             DP_ATTENTION=true
-#             MOE_BACKEND="CUTLASS"
-#         fi
-#     elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-#         if [[ $CONC -gt 32 ]]; then
-#             EP_SIZE="$TP"
-#         fi
-#         if [[ $CONC -ge 256 ]]; then
-#             DP_ATTENTION=true
-#             MOE_BACKEND="CUTLASS"
-#         fi
-#     elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-#         if [[ $CONC -gt 32 ]]; then
-#             EP_SIZE="$TP"
-#             DP_ATTENTION=true
-#             MOE_BACKEND="CUTLASS"
-#         fi
-#     fi
-# elif [[ "$TP" == "8" ]]; then
-#     if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-#         if [[ $CONC -gt 8 ]]; then
-#             EP_SIZE="$TP"
-#         fi
-#         if [[ $CONC -ge 256 ]]; then
-#             DP_ATTENTION=true
-#             MOE_BACKEND="CUTLASS"
-#         fi
-#     elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-#         if [[ $CONC -gt 16 ]]; then
-#             EP_SIZE="$TP"
-#         fi
-#         if [[ $CONC -ge 256 ]]; then
-#             DP_ATTENTION=true
-#             MOE_BACKEND="CUTLASS"
-#         fi
-#     elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-#         if [[ $CONC -gt 32 ]]; then
-#             EP_SIZE="$TP"
-#             DP_ATTENTION=true
-#             MOE_BACKEND="CUTLASS"
-#         fi
-#     fi
-# fi
-
-# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
+# # ========= Determine MOE_BACKEND based on ISL, OSL, CONC =========
+# Default
+MOE_BACKEND="TRTLLM"
+
+if [[ "$TP" == "4" ]]; then
+    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -ge 256 ]]; then
+            MOE_BACKEND="CUTLASS"
+        fi
+    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+        if [[ $CONC -ge 256 ]]; then
+            MOE_BACKEND="CUTLASS"
+        fi
+    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -gt 32 ]]; then
+            MOE_BACKEND="CUTLASS"
+        fi
+    fi
+elif [[ "$TP" == "8" ]]; then
+    if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -ge 256 ]]; then
+            MOE_BACKEND="CUTLASS"
+        fi
+    elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
+        if [[ $CONC -ge 256 ]]; then
+            MOE_BACKEND="CUTLASS"
+        fi
+    elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
+        if [[ $CONC -gt 32 ]]; then
+            MOE_BACKEND="CUTLASS"
+        fi
+    fi
+fi
+
+echo "MOE_BACKEND set to '$MOE_BACKEND'"
 
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
index 67dbb8396..6bc8c9fa7 100644
--- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh
@@ -15,40 +15,15 @@
 # PORT_OFFSET
 # EP_SIZE
 # DP_ATTENTION
-# MOE_BACKEND
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-# Default backend is DEEPGEMM
-if [[ "MOE_BACKEND" == "NONE" ]]; then
-    MOE_BACKEND="DEEPGEMM"
-fi
+MOE_BACKEND="DEEPGEMM"
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
 
 hf download $MODEL
 
-# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-# EP_SIZE="$TP"
-# MOE_BACKEND="DEEPGEMM"
-# DP_ATTENTION=false
-
-# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-#     if [[ $CONC -gt 32 ]]; then
-#         DP_ATTENTION=true
-#     fi
-# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-#     if [[ $CONC -gt 64 ]]; then
-#         DP_ATTENTION=true
-#     fi
-# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-#     if [[ $CONC -gt 64 ]]; then
-#         DP_ATTENTION=true
-#     fi
-# fi
-
-echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
-
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 EXTRA_CONFIG_FILE="dsr1-fp8.yml"
diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index a71068867..3552fac78 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -15,40 +15,15 @@
 # PORT_OFFSET
 # EP_SIZE
 # DP_ATTENTION
-# MOE_BACKEND
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-# Default backend is CUTLASS
-if [[ "MOE_BACKEND" == "NONE" ]]; then
-    MOE_BACKEND="CUTLASS"
-fi
+MOE_BACKEND="CUTLASS"
 
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND"
 
 hf download $MODEL
 
-# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-# EP_SIZE="$TP"
-# MOE_BACKEND="CUTLASS"
-# DP_ATTENTION=false
-
-# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-#     if [[ $CONC -gt 64 ]]; then
-#         DP_ATTENTION=true
-#     fi
-# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-#     if [[ $CONC -gt 64 ]]; then
-#         DP_ATTENTION=true
-#     fi
-# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then
-#     if [[ $CONC -gt 32 ]]; then
-#         DP_ATTENTION=true
-#     fi
-# fi
-
-echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
-
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 EXTRA_CONFIG_FILE="dsr1-fp8.yml"
diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
index 9ffa26659..4f17d4d4f 100644
--- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh
+++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh
@@ -15,54 +15,29 @@
 # PORT_OFFSET
 # EP_SIZE
 # DP_ATTENTION
-# MOE_BACKEND
 
 # GPTOSS TRTLLM Deployment Guide:
 # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-# Default backend is TRTLLM
-if [[ "MOE_BACKEND" == "NONE" ]]; then
-    MOE_BACKEND="TRTLLM"
-fi
-
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION"
 
 hf download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=$(( 8888 + $PORT_OFFSET ))
 
-# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-# EP_SIZE="1"
-# MOE_BACKEND="TRTLLM"
-# DP_ATTENTION=false
-
-# Lower concurrencies: Concurrency < 256
-# MoE backend=TRTLLM
-# Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k)
-# TEP_REQUIRED=false
-# if [[ "$TP" == "4" || "$TP" == "8" ]]; then 
-#     if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
-#         TEP_REQUIRED=true
-#     elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
-#         TEP_REQUIRED=true
-#     fi
-# fi
-# if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then
-#     EP_SIZE="$TP"
-# fi
+# ========= Determine MOE_BACKEND based on ISL, OSL, CONC =========
+# Default
+MOE_BACKEND="TRTLLM"
 
 # Higher concurrencies: Concurrency >= 256
 #   MoE Backend = CUTLASS
-#   Use DP attention with expert parallel MoE
-# if [[ $CONC -ge 256 ]]; then
-#     EP_SIZE="$TP"
-#     DP_ATTENTION=true
-#     MOE_BACKEND="CUTLASS"
-# fi
-
-# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'"
+if [[ $CONC -ge 256 ]]; then
+    MOE_BACKEND="CUTLASS"
+fi
+
+echo "MOE_BACKEND set to $MOE_BACKEND"
 
 EXTRA_CONFIG_FILE="gptoss-fp4.yml"
 export TRTLLM_ENABLE_PDL=1

From a1f7b73cdb1591bc25f0934b912e5a4212ebadb3 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Thu, 16 Oct 2025 23:08:37 -0500
Subject: [PATCH 03/17] fixes; adding back explanatory comments

---
 .github/configs/search-space.yml      | 7 +++++++
 benchmarks/dsr1_fp4_b200_trt_slurm.sh | 2 +-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml
index 34ed9f268..f89643d14 100644
--- a/.github/configs/search-space.yml
+++ b/.github/configs/search-space.yml
@@ -105,6 +105,7 @@ gptoss:
 llama:
   fp4:
     b200:
+      # fix: add TP=2,4 to B200, just as mi355 has
       1k1k:
       - { tp: 1, conc: { start: 64, end: 64 } }
       - { tp: 2, conc: { start: 32, end: 64 } }
@@ -121,6 +122,8 @@ llama:
       - { tp: 4, conc: { start: 4, end: 32 } }
       - { tp: 8, conc: { start: 4, end: 8 } }
     b200-trt:
+      # fix: add TP=2,4 to B200, just as mi355 has
+      # B200 can achieve TPS/User >= 30 with larger concurrency till 128
       1k1k:
       - { tp: 1, conc: { start: 128, end: 128 } }
       - { tp: 2, conc: { start: 64, end: 128 } }
@@ -172,6 +175,7 @@ llama:
       - { tp: [ 1, 2 ], conc: { start: 16, end: 64 } }
       - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } }
     h200-trt:
+      # H200 can achieve TPS/User >= 30 with larger concurrency till 128
       1k1k:
       - { tp: 1, conc: { start: 128, end: 128 } }
       - { tp: 2, conc: { start: 64, end: 128 } }
@@ -187,6 +191,7 @@ llama:
       - { tp: 4, conc: { start: 4, end: 128 } }
       - { tp: 8, conc: { start: 4, end: 32 } }
     b200:
+      # fix: add TP=2,4 to B200, just as mi355 has
       1k1k:
       - { tp: 1, conc: { start: 64, end: 64 } }
       - { tp: 2, conc: { start: 32, end: 64 } }
@@ -201,6 +206,8 @@ llama:
       - { tp: 4, conc: { start: 4, end: 64 } }
       - { tp: 8, conc: { start: 4, end: 32 } }
     b200-trt:
+      # fix: add TP=2,4 to B200, just as mi355 has
+      # B200 can achieve TPS/User >= 30 with larger concurrency till 256
       1k1k:
       - { tp: 1, conc: { start: 128, end: 128 } }
       - { tp: 2, conc: { start: 64, end: 128 } }
diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index 215de86e5..b78e1ecfb 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -27,7 +27,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO
 
 hf download $MODEL
 
-# # ========= Determine MOE_BACKEND based on ISL, OSL, CONC =========
+# ========= Determine MOE_BACKEND based on ISL, OSL, CONC =========
 # Default
 MOE_BACKEND="TRTLLM"
 

From e2d1a4c123275e20956ca7491e79676af958a63e Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 11:23:13 -0500
Subject: [PATCH 04/17] adding tests

---
 .github/workflows/benchmark-tmpl.yml |   3 +-
 utils/flatten_matrix.py              | 141 ++++----
 utils/test_flatten_matrix.py         | 469 +++++++++++++++++++++++++++
 3 files changed, 554 insertions(+), 59 deletions(-)
 create mode 100644 utils/test_flatten_matrix.py

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index a027bda8b..bcc96983e 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -70,7 +70,8 @@ jobs:
       fail-fast: false
       matrix:
         config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }}
-    name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} conc${{ matrix.config.conc }}'
+        
+    name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}'
 
     env:
       TP: ${{ matrix.config.tp }}
diff --git a/utils/flatten_matrix.py b/utils/flatten_matrix.py
index a46bce549..7c75058d7 100644
--- a/utils/flatten_matrix.py
+++ b/utils/flatten_matrix.py
@@ -2,72 +2,97 @@
 import os
 import json
 
-with open('.github/configs/search-space.yml', 'r') as f:
-    search_space = yaml.safe_load(f)
-
-seq_len_map = {
-    '1024': '1k',
-    '8192': '8k',
-}
-
-runner = os.environ['RUNNER']
-model = os.environ['MODEL']
-isl = os.environ['ISL']
-osl = os.environ['OSL']
-precision = os.environ['PRECISION']
-
-# In the workflows, not all model references are the same so do a sort
-# of partial matching to map to the search space keys
-model_map = {
-    'gpt-oss': 'gptoss',
-    'llama-3.3-70b-instruct': 'llama',
-    'deepseek-r1-0528': 'dsr1',
-}
-
-model_key = None
-model_lower = model.lower()
-for key, value in model_map.items():
-    if key.lower() in model_lower:
-        model_key = value
-        break
-
-if model_key is None:
-    raise ValueError(f"Model '{model}' is not recognized.")
-
-seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}"
-
-if isinstance(search_space, list):
-    entries = search_space
-else:
+def flatten_search_space(config_path, runner, model, isl, osl, precision):
+    with open(config_path, 'r') as f:
+        search_space = yaml.safe_load(f)
+
+    seq_len_map = {
+        '1024': '1k',
+        '8192': '8k',
+    }
+
+    model_map = {
+        'gpt-oss': 'gptoss',
+        'llama-3.3-70b-instruct': 'llama',
+        'deepseek-r1-0528': 'dsr1',
+    }
+
+    model_key = None
+    model_lower = model.lower()
+    for key, value in model_map.items():
+        if key.lower() in model_lower:
+            model_key = value
+            break
+
+    assert model_key, f"model '{model}' not recognized"
+
+    assert seq_len_map.get(isl) and seq_len_map.get(osl), f"either isl or osl not recognized"
+    seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}"
+
+    assert search_space.get(model_key, {}).get(precision), f"precision '{precision}' not recognized"
+    assert search_space.get(model_key, {}).get(precision).get(runner), f"runner '{runner}' not recognized"
+
     entries = search_space.get(model_key, {}).get(
-        precision, {}).get(runner, {}).get(seq_len, [])
+        precision).get(runner, {}).get(seq_len, [])
 
-flattened_search_space = []
-for entry in entries:
-    tp_list = entry['tp'] if isinstance(
-        entry.get('tp'), list) else [entry.get('tp')]
+    flattened_search_space = []
+    for entry in entries:
+        assert entry.get('tp'), f"entry malformed, expecting field 'tp'"
+        tp = entry.get('tp')
+        assert isinstance(tp, int) or (isinstance(tp, list) and all(isinstance(x, int) for x in tp)), \
+            f"entry malformed, expecting field 'tp' to be either an int or list of ints"
+        
+        tp_list = entry['tp'] if isinstance(entry['tp'], list) else [entry['tp']]
 
-    conc_config = entry.get('conc')
-    if isinstance(conc_config, dict):
+        conc_config = entry.get('conc')
+        
+        assert conc_config, f"entry malformed, missing field 'conc'"
+        assert isinstance(conc_config, dict), f"entry malformed, 'conc' must be a dict"
+        assert 'start' in conc_config, f"entry malformed, 'conc' missing required field 'start'"
+        assert 'end' in conc_config, f"entry malformed, 'conc' missing required field 'end'"
+        assert isinstance(conc_config['start'], int), f"entry malformed, 'conc.start' must be an int"
+        assert isinstance(conc_config['end'], int), f"entry malformed, 'conc.end' must be an int"
+        assert conc_config['start'] <= conc_config['end'], f"entry malformed, 'conc.start' must be <= 'conc.end'"
+        
         start = conc_config['start']
         end = conc_config['end']
         step_factor = conc_config.get('step', 2)
+        
+        if 'step' in conc_config:
+            assert isinstance(step_factor, int), f"entry malformed, 'conc.step' must be an int"
+            assert step_factor > 1, f"entry malformed, 'conc.step' must be > 1"
+        
         conc_list = []
         current = start
         while current <= end:
             conc_list.append(current)
             current *= step_factor
-    elif isinstance(conc_config, list):
-        conc_list = conc_config
-    else:
-        conc_list = [conc_config]
-
-    for tp_value in tp_list:
-        for conc_value in conc_list:
-            new_entry = entry.copy()
-            new_entry['tp'] = tp_value
-            new_entry['conc'] = conc_value
-            flattened_search_space.append(new_entry)
-
-with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
-    f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n")
+
+        for tp_value in tp_list:
+            for conc_value in conc_list:
+                new_entry = entry.copy()
+                new_entry['tp'] = tp_value
+                new_entry['conc'] = conc_value
+                flattened_search_space.append(new_entry)
+
+    return flattened_search_space
+
+
+def main():
+    config_path = '.github/configs/search-space.yml'
+    runner = os.environ['RUNNER']
+    model = os.environ['MODEL']
+    isl = os.environ['ISL']
+    osl = os.environ['OSL']
+    precision = os.environ['PRECISION']
+    
+    flattened_search_space = flatten_search_space(
+        config_path, runner, model, isl, osl, precision
+    )
+    
+    with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
+        f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n")
+
+
+if __name__ == '__main__':
+    main()
\ No newline at end of file
diff --git a/utils/test_flatten_matrix.py b/utils/test_flatten_matrix.py
new file mode 100644
index 000000000..b4f0620ad
--- /dev/null
+++ b/utils/test_flatten_matrix.py
@@ -0,0 +1,469 @@
+import pytest
+import yaml
+from flatten_matrix import flatten_search_space
+
+
+@pytest.fixture
+def minimal_config():
+    """Minimal valid config for testing"""
+    return {
+        'gptoss': {
+            'fp4': {
+                'h100': {
+                    '1k1k': [
+                        {'tp': [2, 4], 'conc': {'start': 4, 'end': 8}}
+                    ]
+                }
+            }
+        },
+        'llama': {
+            'fp8': {
+                'b200': {
+                    '1k8k': [
+                        {'tp': 2, 'conc': {'start': 4, 'end': 64, 'step': 4}}
+                    ]
+                }
+            }
+        },
+        'dsr1': {
+            'fp4': {
+                'b200-trt': {
+                    '8k1k': [
+                        {'tp': 4, 'conc': {'start': 4, 'end': 32}},
+                        {'tp': 4, 'ep': 4, 'dp_attention': 'true', 'conc': {'start': 64, 'end': 256}}
+                    ]
+                }
+            }
+        }
+    }
+
+
+@pytest.fixture
+def config_file(minimal_config, tmp_path):
+    # temp config file
+    config_path = tmp_path / "search-space.yml"
+    with open(config_path, 'w') as f:
+        yaml.dump(minimal_config, f)
+    return config_path
+
+
+class TestValidCases:
+    """Test valid input scenarios"""
+    
+    def test_single_tp_value(self, config_file):
+        """Test with single TP value"""
+        result = flatten_search_space(
+            config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8'
+        )
+        
+        # Should generate: tp=2, conc=[4, 16, 64] with step=4
+        assert len(result) == 3
+        assert all(entry['tp'] == 2 for entry in result)
+        assert [entry['conc'] for entry in result] == [4, 16, 64]
+    
+    def test_list_of_tp_values(self, config_file):
+        """Test with list of TP values"""
+        result = flatten_search_space(
+            config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4'
+        )
+        
+        # Should generate: tp=[2,4], conc=[4,8] = 2*2 = 4 combinations
+        assert len(result) == 4
+        tp_values = [entry['tp'] for entry in result]
+        assert tp_values.count(2) == 2
+        assert tp_values.count(4) == 2
+    
+    def test_optional_fields_preserved(self, config_file):
+        """Test that optional fields like ep and dp_attention are preserved"""
+        result = flatten_search_space(
+            config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4'
+        )
+        
+        # Second entry should have ep and dp_attention
+        entries_with_ep = [e for e in result if 'ep' in e]
+        assert len(entries_with_ep) > 0
+        assert all(e['ep'] == 4 for e in entries_with_ep)
+        
+        entries_with_dp = [e for e in result if 'dp_attention' in e]
+        assert len(entries_with_dp) > 0
+        assert all(e['dp_attention'] == 'true' for e in entries_with_dp)
+    
+    def test_default_step_factor(self, config_file):
+        """Test that default step factor of 2 is used when not specified"""
+        result = flatten_search_space(
+            config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4'
+        )
+        
+        # conc: start=4, end=8, default step=2 -> [4, 8]
+        conc_values = sorted(set(entry['conc'] for entry in result))
+        assert conc_values == [4, 8]
+    
+    def test_custom_step_factor(self, config_file):
+        """Test custom step factor"""
+        result = flatten_search_space(
+            config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8'
+        )
+        
+        # conc: start=4, end=64, step=4 -> [4, 16, 64]
+        conc_values = sorted(set(entry['conc'] for entry in result))
+        assert conc_values == [4, 16, 64]
+
+
+class TestModelMapping:
+    """Test model name mapping"""
+    
+    def test_gptoss_mapping(self, config_file):
+        """Test gpt-oss maps to gptoss"""
+        result = flatten_search_space(
+            config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4'
+        )
+        assert len(result) > 0
+    
+    def test_llama_mapping(self, config_file):
+        """Test llama mapping with case insensitivity"""
+        result = flatten_search_space(
+            config_file, 'b200', 'LLAMA-3.3-70B-INSTRUCT', '1024', '8192', 'fp8'
+        )
+        assert len(result) > 0
+    
+    def test_dsr1_mapping(self, config_file):
+        """Test deepseek-r1 maps to dsr1"""
+        result = flatten_search_space(
+            config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4'
+        )
+        assert len(result) > 0
+
+
+class TestInvalidInputs:
+    """Test error handling for invalid inputs"""
+    
+    def test_unrecognized_model(self, config_file):
+        """Test error for unrecognized model"""
+        with pytest.raises(AssertionError, match="model .* not recognized"):
+            flatten_search_space(
+                config_file, 'h100', 'unknown-model', '1024', '1024', 'fp4'
+            )
+    
+    def test_invalid_isl(self, config_file):
+        """Test error for invalid ISL"""
+        with pytest.raises(AssertionError, match="either isl or osl not recognized"):
+            flatten_search_space(
+                config_file, 'h100', 'gpt-oss', '2048', '1024', 'fp4'
+            )
+    
+    def test_invalid_osl(self, config_file):
+        """Test error for invalid OSL"""
+        with pytest.raises(AssertionError, match="either isl or osl not recognized"):
+            flatten_search_space(
+                config_file, 'h100', 'gpt-oss', '1024', '4096', 'fp4'
+            )
+    
+    def test_invalid_precision(self, config_file):
+        """Test error for invalid precision"""
+        with pytest.raises(AssertionError, match="precision .* not recognized"):
+            flatten_search_space(
+                config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp16'
+            )
+    
+    def test_invalid_runner(self, config_file):
+        """Test error for invalid runner"""
+        with pytest.raises(AssertionError, match="runner .* not recognized"):
+            flatten_search_space(
+                config_file, 'a100', 'gpt-oss', '1024', '1024', 'fp4'
+            )
+
+
+class TestMalformedEntries:
+    """Test validation of malformed config entries"""
+    
+    def test_missing_tp_field(self, tmp_path):
+        """Test error when tp field is missing"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'conc': {'start': 4, 'end': 8}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="entry malformed, expecting field 'tp'"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_tp_wrong_type(self, tmp_path):
+        """Test error when tp is wrong type"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 'invalid', 'conc': {'start': 4, 'end': 8}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_tp_list_with_non_ints(self, tmp_path):
+        """Test error when tp list contains non-integers"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': [2, 'four', 8], 'conc': {'start': 4, 'end': 8}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_missing_conc_field(self, tmp_path):
+        """Test error when conc field is missing"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="entry malformed, missing field 'conc'"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_conc_not_dict(self, tmp_path):
+        """Test error when conc is not a dict"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': [4, 8, 16]}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc' must be a dict"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_conc_missing_start(self, tmp_path):
+        """Test error when conc.start is missing"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': {'end': 8}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc' missing required field 'start'"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_conc_missing_end(self, tmp_path):
+        """Test error when conc.end is missing"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': {'start': 4}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc' missing required field 'end'"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_conc_start_not_int(self, tmp_path):
+        """Test error when conc.start is not an int"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': {'start': '4', 'end': 8}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc.start' must be an int"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_conc_end_not_int(self, tmp_path):
+        """Test error when conc.end is not an int"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': {'start': 4, 'end': '8'}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc.end' must be an int"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_conc_start_greater_than_end(self, tmp_path):
+        """Test error when conc.start > conc.end"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': {'start': 16, 'end': 4}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc.start' must be <= 'conc.end'"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_step_not_int(self, tmp_path):
+        """Test error when step is not an int"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': '2'}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc.step' must be an int"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+    
+    def test_step_not_greater_than_one(self, tmp_path):
+        """Test error when step <= 1"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': 1}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        with pytest.raises(AssertionError, match="'conc.step' must be > 1"):
+            flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+
+
+class TestConcurrencyGeneration:
+    """Test concurrency value generation logic"""
+    
+    def test_geometric_progression(self, tmp_path):
+        """Test that concurrency values follow geometric progression"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 1, 'conc': {'start': 4, 'end': 64, 'step': 2}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        result = flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+        
+        conc_values = [entry['conc'] for entry in result]
+        assert conc_values == [4, 8, 16, 32, 64]
+    
+    def test_single_conc_value(self, tmp_path):
+        """Test when start equals end"""
+        config = {
+            'gptoss': {
+                'fp4': {
+                    'h100': {
+                        '1k1k': [
+                            {'tp': 1, 'conc': {'start': 64, 'end': 64}}
+                        ]
+                    }
+                }
+            }
+        }
+        config_path = tmp_path / "config.yml"
+        with open(config_path, 'w') as f:
+            yaml.dump(config, f)
+        
+        result = flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4')
+        
+        assert len(result) == 1
+        assert result[0]['conc'] == 64
+
+
+if __name__ == '__main__':
+    pytest.main([__file__, '-v'])
\ No newline at end of file

From 731300ced6295356221960a96f7db63f674ec7fe Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 11:24:38 -0500
Subject: [PATCH 05/17] removing extraneous whitespace

---
 benchmarks/dsr1_fp8_h200_trt_slurm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
index 3552fac78..5dfdf8617 100644
--- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh
@@ -32,7 +32,7 @@ cat > $EXTRA_CONFIG_FILE << EOF
 cuda_graph_config:
     enable_padding: true
     max_batch_size: 128
- enable_attention_dp: $DP_ATTENTION
+enable_attention_dp: $DP_ATTENTION
 print_iter_log: true
 kv_cache_config:
     dtype: fp8

From d44695e2795c08e8dbfa3f8b5f98302b98645302 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 11:49:17 -0500
Subject: [PATCH 06/17] addings docs for new config

---
 .github/configs/CONFIGS.md | 80 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 80 insertions(+)
 create mode 100644 .github/configs/CONFIGS.md

diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md
new file mode 100644
index 000000000..35b804804
--- /dev/null
+++ b/.github/configs/CONFIGS.md
@@ -0,0 +1,80 @@
+# Search Space Configuration
+
+`search-space.yml` (in this directory) defines which benchmark configurations to run for each model, GPU, precision, and sequence-length combination.
+
+## Quick Start
+
+Add a new configuration by following this pattern:
+```yaml
+model_name:
+  precision:
+    gpu_type:
+      sequence_length:
+        - { tp: [1, 2, 4], conc: { start: 4, end: 64 } }
+```
+
+## Field Definitions
+
+### Required Fields
+
+- **`tp`**: Tensor Parallelism (number of GPUs)
+  - Single value: `tp: 4`
+  - Multiple values: `tp: [2, 4, 8]`
+
+- **`conc`**: Concurrency (number of simultaneous requests)
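+  - `start`: First value to test (integer, must be <= `end`)
+  - `end`: Inclusive upper bound (integer); the sweep stops at the last generated value that is <= `end`
+  - `step`: Integer multiplier (> 1) applied to get each next value (default: 2)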
+  - Example: `{start: 4, end: 64}` → tests [4, 8, 16, 32, 64]
+
+### Optional Fields
+
+- **`ep`**: Expert Parallelism for MoE models (default: 1)
+
+- **`dp_attention`**: Data Parallel Attention (default: `"false"`)
+
+## Examples
+
+### Basic configuration
+```yaml
+gptoss:
+  fp4:
+    h100:
+      1k1k:  # 1024 input, 1024 output
+        - { tp: [2, 4, 8], conc: { start: 4, end: 64 } }
+```
+This tests 15 combinations: 3 TP values × 5 concurrency values
+
+### Configuration with optional fields
+```yaml
+dsr1:
+  fp4:
+    b200-trt:
+      1k1k:
+        - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } }
+```
+
+### Custom step factor
+```yaml
+llama:
+  fp8:
+    b200:
+      1k8k:
+        - { tp: 2, conc: { start: 4, end: 64, step: 4 } }
+```
+This tests [4, 16, 64] (multiplies by 4 instead of default 2)
+
+## Key Points
+
+1. **Models**: `gptoss`, `llama`, `dsr1`
+2. **Precisions**: `fp4`, `fp8`
+3. **Sequence lengths**: `1k1k`, `1k8k`, `8k1k` (input×output)
+4. Each entry expands to test all combinations of TP and concurrency values
+5. Use `-trt` suffix for TensorRT-optimized hardware configs
+
+## Testing Your Changes
+
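+Run the flattening script to validate; it reads its inputs from environment variables and appends the result to the file named by `GITHUB_OUTPUT`:
+```bash
+RUNNER=h100 MODEL=gpt-oss ISL=1024 OSL=1024 PRECISION=fp4 GITHUB_OUTPUT=/dev/stdout python utils/flatten_matrix.py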
+```
\ No newline at end of file

From a148f00bf74cf6912dae828ba6bacc6c055143cf Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 12:10:57 -0500
Subject: [PATCH 07/17] adding summarization and result changes

---
 .github/configs/CONFIGS.md                    |  2 +-
 .../workflows/benchmark-multinode-tmpl.yml    |  4 +-
 .github/workflows/benchmark-tmpl.yml          |  4 +-
 utils/count_num_jobs.py                       | 37 +++++++++++++++++++
 utils/process_result.py                       | 14 ++++---
 utils/summarize.py                            |  8 ++--
 6 files changed, 57 insertions(+), 12 deletions(-)
 create mode 100644 utils/count_num_jobs.py

diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md
index 35b804804..9021ec83e 100644
--- a/.github/configs/CONFIGS.md
+++ b/.github/configs/CONFIGS.md
@@ -70,7 +70,7 @@ This tests [4, 16, 64] (multiplies by 4 instead of default 2)
 2. **Precisions**: `fp4`, `fp8`
 3. **Sequence lengths**: `1k1k`, `1k8k`, `8k1k` (input×output)
 4. Each entry expands to test all combinations of TP and concurrency values
-5. Use `-trt` suffix for TensorRT-optimized hardware configs
+5. Comments throughout `search-space.yml` (ported from the old bash scripts) note which parallelism settings apply at which concurrency levels -- read them before editing an entry.
 
 ## Testing Your Changes
 
diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml
index 07f5b876d..b5bcc5817 100644
--- a/.github/workflows/benchmark-multinode-tmpl.yml
+++ b/.github/workflows/benchmark-multinode-tmpl.yml
@@ -84,6 +84,8 @@ jobs:
             exit 1
           fi
 
+        # NOTE: https://github.com/InferenceMAX/InferenceMAX/pull/111 adds EP_SIZE and DP_ATTENTION arguments to utils/process_result.py,
+        # but they are not yet implemented for GB200 multi-node, so pass the defaults here: EP_SIZE=1, DP_ATTENTION="false"
       - name: Process results
         run: |
           # Process each result file
@@ -93,7 +95,7 @@ jobs:
               # Extract GPU count from filename for tp_size calculation
               gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/")
               if [ -n "$gpus" ]; then
-                python3 utils/process_result.py ${{ inputs.runner }} $gpus ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE
+                python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 "false" ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE
               fi
             fi
           done
diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index bcc96983e..901871254 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -70,7 +70,7 @@ jobs:
       fail-fast: false
       matrix:
         config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }}
-        
+
     name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}'
 
     env:
@@ -153,7 +153,7 @@ jobs:
 
       - name: Process result
         run: |
-          python3 utils/process_result.py ${{ inputs.runner }} $TP $RESULT_FILENAME $FRAMEWORK $PRECISION
+          python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION
 
       - name: Upload result
         uses: actions/upload-artifact@v4
diff --git a/utils/count_num_jobs.py b/utils/count_num_jobs.py
new file mode 100644
index 000000000..776fd2a65
--- /dev/null
+++ b/utils/count_num_jobs.py
@@ -0,0 +1,37 @@
+import yaml
+from collections import defaultdict
+
+with open('.github/configs/search-space.yml', 'r') as f:
+    data = yaml.safe_load(f)
+
+gpu_totals = defaultdict(int)
+overall_total = 0
+
+for model in data.values():
+    for precision in model.values():
+        for gpu, runner_data in precision.items():
+            for seq_len in runner_data.values():
+                for entry in seq_len:
+                    # Count TP values
+                    tp_list = entry['tp'] if isinstance(entry['tp'], list) else [entry['tp']]
+                    tp_count = len(tp_list)
+                    
+                    # Count CONC values
+                    conc = entry['conc']
+                    start, end = conc['start'], conc['end']
+                    step = conc.get('step', 2)
+                    
+                    conc_count = 0
+                    current = start
+                    while current <= end:
+                        conc_count += 1
+                        current *= step
+                    
+                    combo_count = tp_count * conc_count
+                    gpu_totals[gpu] += combo_count
+                    overall_total += combo_count
+
+print("Breakdown by GPU:")
+for gpu in sorted(gpu_totals.keys()):
+    print(f"  {gpu}: {gpu_totals[gpu]}")
+print(f"\nTotal combinations: {overall_total}")
\ No newline at end of file
diff --git a/utils/process_result.py b/utils/process_result.py
index 89c4aa7b3..a59d1f7f3 100644
--- a/utils/process_result.py
+++ b/utils/process_result.py
@@ -5,9 +5,11 @@
 
 hw = sys.argv[1]
 tp_size = int(sys.argv[2])
-result_filename = sys.argv[3]
-framework = sys.argv[4]
-precision = sys.argv[5]
+ep_size = int(sys.argv[3])
+dp_attention = sys.argv[4]
+result_filename = sys.argv[5]
+framework = sys.argv[6]
+precision = sys.argv[7]
 
 with open(f'{result_filename}.json') as f:
     bmk_result = json.load(f)
@@ -15,7 +17,9 @@
 data = {
     'hw': hw,
     'tp': tp_size,
+    'ep': ep_size,
     'conc': int(bmk_result['max_concurrency']),
+    'dp_attention': dp_attention, # true or false
     'model': bmk_result['model_id'],
     'framework': framework,
     'precision': precision,
@@ -23,8 +27,8 @@
     'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size
 }
 
-if len(sys.argv) == 7:  # MTP
-    data['mtp'] = sys.argv[6]
+if len(sys.argv) == 9:  # MTP
+    data['mtp'] = sys.argv[8]
 
 for key, value in bmk_result.items():
     if key.endswith('ms'):
diff --git a/utils/summarize.py b/utils/summarize.py
index 1f78caf9c..546a13757 100644
--- a/utils/summarize.py
+++ b/utils/summarize.py
@@ -9,11 +9,11 @@
     with open(result_path) as f:
         result = json.load(f)
     results.append(result)
-results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['conc']))
+results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc']))
 
 summary_header = f'''\
-| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
-| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\
+| Hardware | Framework | Precision | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
+| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\
 '''
 print(summary_header)
 
@@ -25,7 +25,9 @@
         f"| {framework.upper()} "
         f"| {precision.upper()} "
         f"| {result['tp']} "
+        f"| {result['ep']} "
         f"| {result['conc']} "
+        f"| {result['dp_attention']} "
         f"| {(result['median_ttft'] * 1000):.4f} "
         f"| {(result['median_tpot'] * 1000):.4f} "
         f"| {result['median_e2el']:.4f} "

From 3540e4cc4cbd428cbc371200d91bca77c8dd3196 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 12:14:13 -0500
Subject: [PATCH 08/17] TEST update workflow for testing

---
 .github/workflows/dsr1-tmpl.yml | 30 +++++++++++++++---------------
 1 file changed, 15 insertions(+), 15 deletions(-)

diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
index 64dfe4ec4..c8b7fd64d 100644
--- a/.github/workflows/dsr1-tmpl.yml
+++ b/.github/workflows/dsr1-tmpl.yml
@@ -40,21 +40,21 @@ on:
         default: false
 
 jobs:
-  bmk-h200-fp8:
-    if: ${{ inputs.use_h200 }}
-    uses: ./.github/workflows/benchmark-tmpl.yml
-    secrets: inherit
-    with:
-      runner: h200
-      image: 'lmsysorg/sglang:v0.5.2rc2-cu126'
-      model: 'deepseek-ai/DeepSeek-R1-0528'
-      framework: 'sglang'
-      precision: 'fp8'
-      exp-name: ${{ inputs.exp-name }}
-      isl: ${{ inputs.isl }}
-      osl: ${{ inputs.osl }}
-      max-model-len: ${{ inputs.max-model-len }}
-      random-range-ratio: ${{ inputs.random-range-ratio }}
+  # bmk-h200-fp8:
+  #   if: ${{ inputs.use_h200 }}
+  #   uses: ./.github/workflows/benchmark-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     runner: h200
+  #     image: 'lmsysorg/sglang:v0.5.2rc2-cu126'
+  #     model: 'deepseek-ai/DeepSeek-R1-0528'
+  #     framework: 'sglang'
+  #     precision: 'fp8'
+  #     exp-name: ${{ inputs.exp-name }}
+  #     isl: ${{ inputs.isl }}
+  #     osl: ${{ inputs.osl }}
+  #     max-model-len: ${{ inputs.max-model-len }}
+  #     random-range-ratio: ${{ inputs.random-range-ratio }}
 
   bmk-h200-trt-fp8:
     if: ${{ inputs.use_h200 }}

From 01fc80ed16f4d7eff1bc0d025c64898b04aa91d2 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 12:15:33 -0500
Subject: [PATCH 09/17] TEST update workflow for testing

---
 .github/workflows/full-sweep-tmpl.yml | 92 +++++++++++++--------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml
index b086460df..e739c9aa4 100644
--- a/.github/workflows/full-sweep-tmpl.yml
+++ b/.github/workflows/full-sweep-tmpl.yml
@@ -37,30 +37,30 @@ on:
         default: false
 
 jobs:
-  _70b-1k1k:
-    if: ${{ inputs.run_1k1k }}
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k1k'
-      isl: 1024
-      osl: 1024
-      max-model-len: 2048
-      random-range-ratio: 0.8
-      use_h100: ${{ inputs.use_h100 }}
-      use_h200: ${{ inputs.use_h200 }}
-      use_b200: ${{ inputs.use_b200 }}
-      use_mi300x: ${{ inputs.use_mi300x }}
-      use_mi325x: ${{ inputs.use_mi325x }}
-      use_mi355x: ${{ inputs.use_mi355x }}
+  # _70b-1k1k:
+  #   if: ${{ inputs.run_1k1k }}
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k1k'
+  #     isl: 1024
+  #     osl: 1024
+  #     max-model-len: 2048
+  #     random-range-ratio: 0.8
+  #     use_h100: ${{ inputs.use_h100 }}
+  #     use_h200: ${{ inputs.use_h200 }}
+  #     use_b200: ${{ inputs.use_b200 }}
+  #     use_mi300x: ${{ inputs.use_mi300x }}
+  #     use_mi325x: ${{ inputs.use_mi325x }}
+  #     use_mi355x: ${{ inputs.use_mi355x }}
 
-  collect-70b-1k1k-results:
-    needs: _70b-1k1k
-    if: ${{ inputs.run_1k1k && always() }}
-    uses: ./.github/workflows/collect-results.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_1k1k'
+  # collect-70b-1k1k-results:
+  #   needs: _70b-1k1k
+  #   if: ${{ inputs.run_1k1k && always() }}
+  #   uses: ./.github/workflows/collect-results.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_1k1k'
 
   dsr1-1k1k:
     if: ${{ inputs.run_1k1k }}
@@ -87,30 +87,30 @@ jobs:
     with:
       exp-name: 'dsr1_1k1k'
 
-  gptoss-1k1k:
-    if: ${{ inputs.run_1k1k }}
-    uses: ./.github/workflows/gptoss-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'gptoss_1k1k'
-      isl: 1024
-      osl: 1024
-      max-model-len: 2048
-      random-range-ratio: 0.8
-      use_h100: ${{ inputs.use_h100 }}
-      use_h200: ${{ inputs.use_h200 }}
-      use_b200: ${{ inputs.use_b200 }}
-      use_mi300x: ${{ inputs.use_mi300x }}
-      use_mi325x: ${{ inputs.use_mi325x }}
-      use_mi355x: ${{ inputs.use_mi355x }}
+  # gptoss-1k1k:
+  #   if: ${{ inputs.run_1k1k }}
+  #   uses: ./.github/workflows/gptoss-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'gptoss_1k1k'
+  #     isl: 1024
+  #     osl: 1024
+  #     max-model-len: 2048
+  #     random-range-ratio: 0.8
+  #     use_h100: ${{ inputs.use_h100 }}
+  #     use_h200: ${{ inputs.use_h200 }}
+  #     use_b200: ${{ inputs.use_b200 }}
+  #     use_mi300x: ${{ inputs.use_mi300x }}
+  #     use_mi325x: ${{ inputs.use_mi325x }}
+  #     use_mi355x: ${{ inputs.use_mi355x }}
 
-  collect-gptoss-1k1k-results:
-    needs: gptoss-1k1k
-    if: ${{ inputs.run_1k1k && always() }}
-    uses: ./.github/workflows/collect-results.yml
-    secrets: inherit
-    with:
-      exp-name: 'gptoss_1k1k'
+  # collect-gptoss-1k1k-results:
+  #   needs: gptoss-1k1k
+  #   if: ${{ inputs.run_1k1k && always() }}
+  #   uses: ./.github/workflows/collect-results.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'gptoss_1k1k'
 
   _70b-8k1k:
     if: ${{ inputs.run_8k1k }}

From de36f9a8ae43fa0286794cd2a8c623cee009b225 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 12:36:42 -0500
Subject: [PATCH 10/17] fix incorrect formatting for summary

---
 utils/summarize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/summarize.py b/utils/summarize.py
index 546a13757..de8863c78 100644
--- a/utils/summarize.py
+++ b/utils/summarize.py
@@ -12,7 +12,7 @@
 results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc']))
 
 summary_header = f'''\
-| Hardware | Framework | Precision | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
+| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU |
 | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\
 '''
 print(summary_header)

From 010866713884604e79874aaf0ad9f36e5afe669d Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 12:38:14 -0500
Subject: [PATCH 11/17] TEST disable 70b and gptoss 8k1k sweep jobs for testing

---
 .github/workflows/full-sweep-tmpl.yml | 92 +++++++++++++--------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml
index e739c9aa4..a612256d4 100644
--- a/.github/workflows/full-sweep-tmpl.yml
+++ b/.github/workflows/full-sweep-tmpl.yml
@@ -112,30 +112,30 @@ jobs:
   #   with:
   #     exp-name: 'gptoss_1k1k'
 
-  _70b-8k1k:
-    if: ${{ inputs.run_8k1k }}
-    uses: ./.github/workflows/70b-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      use_h100: ${{ inputs.use_h100 }}
-      use_h200: ${{ inputs.use_h200 }}
-      use_b200: ${{ inputs.use_b200 }}
-      use_mi300x: ${{ inputs.use_mi300x }}
-      use_mi325x: ${{ inputs.use_mi325x }}
-      use_mi355x: ${{ inputs.use_mi355x }}
+  # _70b-8k1k:
+  #   if: ${{ inputs.run_8k1k }}
+  #   uses: ./.github/workflows/70b-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     use_h100: ${{ inputs.use_h100 }}
+  #     use_h200: ${{ inputs.use_h200 }}
+  #     use_b200: ${{ inputs.use_b200 }}
+  #     use_mi300x: ${{ inputs.use_mi300x }}
+  #     use_mi325x: ${{ inputs.use_mi325x }}
+  #     use_mi355x: ${{ inputs.use_mi355x }}
 
-  collect-70b-8k1k-results:
-    needs: _70b-8k1k
-    if: ${{ inputs.run_8k1k && always() }}
-    uses: ./.github/workflows/collect-results.yml
-    secrets: inherit
-    with:
-      exp-name: '70b_8k1k'
+  # collect-70b-8k1k-results:
+  #   needs: _70b-8k1k
+  #   if: ${{ inputs.run_8k1k && always() }}
+  #   uses: ./.github/workflows/collect-results.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: '70b_8k1k'
 
   dsr1-8k1k:
     if: ${{ inputs.run_8k1k }}
@@ -162,30 +162,30 @@ jobs:
     with:
       exp-name: 'dsr1_8k1k'
 
-  gptoss-8k1k:
-    if: ${{ inputs.run_8k1k }}
-    uses: ./.github/workflows/gptoss-tmpl.yml
-    secrets: inherit
-    with:
-      exp-name: 'gptoss_8k1k'
-      isl: 8192
-      osl: 1024
-      max-model-len: 9216
-      random-range-ratio: 0.8
-      use_h100: ${{ inputs.use_h100 }}
-      use_h200: ${{ inputs.use_h200 }}
-      use_b200: ${{ inputs.use_b200 }}
-      use_mi300x: ${{ inputs.use_mi300x }}
-      use_mi325x: ${{ inputs.use_mi325x }}
-      use_mi355x: ${{ inputs.use_mi355x }}
+  # gptoss-8k1k:
+  #   if: ${{ inputs.run_8k1k }}
+  #   uses: ./.github/workflows/gptoss-tmpl.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'gptoss_8k1k'
+  #     isl: 8192
+  #     osl: 1024
+  #     max-model-len: 9216
+  #     random-range-ratio: 0.8
+  #     use_h100: ${{ inputs.use_h100 }}
+  #     use_h200: ${{ inputs.use_h200 }}
+  #     use_b200: ${{ inputs.use_b200 }}
+  #     use_mi300x: ${{ inputs.use_mi300x }}
+  #     use_mi325x: ${{ inputs.use_mi325x }}
+  #     use_mi355x: ${{ inputs.use_mi355x }}
 
-  collect-gptoss-8k1k-results:
-    needs: gptoss-8k1k
-    if: ${{ inputs.run_8k1k && always() }}
-    uses: ./.github/workflows/collect-results.yml
-    secrets: inherit
-    with:
-      exp-name: 'gptoss_8k1k'
+  # collect-gptoss-8k1k-results:
+  #   needs: gptoss-8k1k
+  #   if: ${{ inputs.run_8k1k && always() }}
+  #   uses: ./.github/workflows/collect-results.yml
+  #   secrets: inherit
+  #   with:
+  #     exp-name: 'gptoss_8k1k'
 
   _70b-1k8k:
     if: ${{ inputs.run_1k8k }}

From 2f015cc3988c187daa85a2e4a64d5140cae5fdbb Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 12:42:24 -0500
Subject: [PATCH 12/17] set timeout-minutes to 2 (on benchmark job, not flatten job)

---
 .github/workflows/benchmark-tmpl.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 901871254..f6460f930 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -64,7 +64,7 @@ jobs:
   benchmark:
     runs-on: ${{ inputs.runner }}
     needs: flatten-search-space-matrix
-    timeout-minutes: 180
+    timeout-minutes: 2
 
     strategy:
       fail-fast: false

From 96c128e1dc5ef1fd1b471471d6cce143b12be31a Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 12:51:53 -0500
Subject: [PATCH 13/17] remove debug

---
 .github/workflows/benchmark-tmpl.yml | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index f6460f930..77df1aee8 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -80,9 +80,6 @@ jobs:
       DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }}
 
     steps:
-      - name: debug
-        run: echo "${{ matrix.config }}"
-
       - name: Resource cleanup
         run: |
           if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then

From 4aadd32f7d8450df457fd74c7ca2c944637aea3c Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 13:05:59 -0500
Subject: [PATCH 14/17] re-enable 70b and gptoss 1k1k/8k1k sweep jobs

---
 .github/workflows/full-sweep-tmpl.yml | 184 +++++++++++++-------------
 1 file changed, 92 insertions(+), 92 deletions(-)

diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml
index a612256d4..b086460df 100644
--- a/.github/workflows/full-sweep-tmpl.yml
+++ b/.github/workflows/full-sweep-tmpl.yml
@@ -37,30 +37,30 @@ on:
         default: false
 
 jobs:
-  # _70b-1k1k:
-  #   if: ${{ inputs.run_1k1k }}
-  #   uses: ./.github/workflows/70b-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: '70b_1k1k'
-  #     isl: 1024
-  #     osl: 1024
-  #     max-model-len: 2048
-  #     random-range-ratio: 0.8
-  #     use_h100: ${{ inputs.use_h100 }}
-  #     use_h200: ${{ inputs.use_h200 }}
-  #     use_b200: ${{ inputs.use_b200 }}
-  #     use_mi300x: ${{ inputs.use_mi300x }}
-  #     use_mi325x: ${{ inputs.use_mi325x }}
-  #     use_mi355x: ${{ inputs.use_mi355x }}
+  _70b-1k1k:
+    if: ${{ inputs.run_1k1k }}
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k1k'
+      isl: 1024
+      osl: 1024
+      max-model-len: 2048
+      random-range-ratio: 0.8
+      use_h100: ${{ inputs.use_h100 }}
+      use_h200: ${{ inputs.use_h200 }}
+      use_b200: ${{ inputs.use_b200 }}
+      use_mi300x: ${{ inputs.use_mi300x }}
+      use_mi325x: ${{ inputs.use_mi325x }}
+      use_mi355x: ${{ inputs.use_mi355x }}
 
-  # collect-70b-1k1k-results:
-  #   needs: _70b-1k1k
-  #   if: ${{ inputs.run_1k1k && always() }}
-  #   uses: ./.github/workflows/collect-results.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: '70b_1k1k'
+  collect-70b-1k1k-results:
+    needs: _70b-1k1k
+    if: ${{ inputs.run_1k1k && always() }}
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_1k1k'
 
   dsr1-1k1k:
     if: ${{ inputs.run_1k1k }}
@@ -87,55 +87,55 @@ jobs:
     with:
       exp-name: 'dsr1_1k1k'
 
-  # gptoss-1k1k:
-  #   if: ${{ inputs.run_1k1k }}
-  #   uses: ./.github/workflows/gptoss-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: 'gptoss_1k1k'
-  #     isl: 1024
-  #     osl: 1024
-  #     max-model-len: 2048
-  #     random-range-ratio: 0.8
-  #     use_h100: ${{ inputs.use_h100 }}
-  #     use_h200: ${{ inputs.use_h200 }}
-  #     use_b200: ${{ inputs.use_b200 }}
-  #     use_mi300x: ${{ inputs.use_mi300x }}
-  #     use_mi325x: ${{ inputs.use_mi325x }}
-  #     use_mi355x: ${{ inputs.use_mi355x }}
+  gptoss-1k1k:
+    if: ${{ inputs.run_1k1k }}
+    uses: ./.github/workflows/gptoss-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: 'gptoss_1k1k'
+      isl: 1024
+      osl: 1024
+      max-model-len: 2048
+      random-range-ratio: 0.8
+      use_h100: ${{ inputs.use_h100 }}
+      use_h200: ${{ inputs.use_h200 }}
+      use_b200: ${{ inputs.use_b200 }}
+      use_mi300x: ${{ inputs.use_mi300x }}
+      use_mi325x: ${{ inputs.use_mi325x }}
+      use_mi355x: ${{ inputs.use_mi355x }}
 
-  # collect-gptoss-1k1k-results:
-  #   needs: gptoss-1k1k
-  #   if: ${{ inputs.run_1k1k && always() }}
-  #   uses: ./.github/workflows/collect-results.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: 'gptoss_1k1k'
+  collect-gptoss-1k1k-results:
+    needs: gptoss-1k1k
+    if: ${{ inputs.run_1k1k && always() }}
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: 'gptoss_1k1k'
 
-  # _70b-8k1k:
-  #   if: ${{ inputs.run_8k1k }}
-  #   uses: ./.github/workflows/70b-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: '70b_8k1k'
-  #     isl: 8192
-  #     osl: 1024
-  #     max-model-len: 9216
-  #     random-range-ratio: 0.8
-  #     use_h100: ${{ inputs.use_h100 }}
-  #     use_h200: ${{ inputs.use_h200 }}
-  #     use_b200: ${{ inputs.use_b200 }}
-  #     use_mi300x: ${{ inputs.use_mi300x }}
-  #     use_mi325x: ${{ inputs.use_mi325x }}
-  #     use_mi355x: ${{ inputs.use_mi355x }}
+  _70b-8k1k:
+    if: ${{ inputs.run_8k1k }}
+    uses: ./.github/workflows/70b-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_8k1k'
+      isl: 8192
+      osl: 1024
+      max-model-len: 9216
+      random-range-ratio: 0.8
+      use_h100: ${{ inputs.use_h100 }}
+      use_h200: ${{ inputs.use_h200 }}
+      use_b200: ${{ inputs.use_b200 }}
+      use_mi300x: ${{ inputs.use_mi300x }}
+      use_mi325x: ${{ inputs.use_mi325x }}
+      use_mi355x: ${{ inputs.use_mi355x }}
 
-  # collect-70b-8k1k-results:
-  #   needs: _70b-8k1k
-  #   if: ${{ inputs.run_8k1k && always() }}
-  #   uses: ./.github/workflows/collect-results.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: '70b_8k1k'
+  collect-70b-8k1k-results:
+    needs: _70b-8k1k
+    if: ${{ inputs.run_8k1k && always() }}
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: '70b_8k1k'
 
   dsr1-8k1k:
     if: ${{ inputs.run_8k1k }}
@@ -162,30 +162,30 @@ jobs:
     with:
       exp-name: 'dsr1_8k1k'
 
-  # gptoss-8k1k:
-  #   if: ${{ inputs.run_8k1k }}
-  #   uses: ./.github/workflows/gptoss-tmpl.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: 'gptoss_8k1k'
-  #     isl: 8192
-  #     osl: 1024
-  #     max-model-len: 9216
-  #     random-range-ratio: 0.8
-  #     use_h100: ${{ inputs.use_h100 }}
-  #     use_h200: ${{ inputs.use_h200 }}
-  #     use_b200: ${{ inputs.use_b200 }}
-  #     use_mi300x: ${{ inputs.use_mi300x }}
-  #     use_mi325x: ${{ inputs.use_mi325x }}
-  #     use_mi355x: ${{ inputs.use_mi355x }}
+  gptoss-8k1k:
+    if: ${{ inputs.run_8k1k }}
+    uses: ./.github/workflows/gptoss-tmpl.yml
+    secrets: inherit
+    with:
+      exp-name: 'gptoss_8k1k'
+      isl: 8192
+      osl: 1024
+      max-model-len: 9216
+      random-range-ratio: 0.8
+      use_h100: ${{ inputs.use_h100 }}
+      use_h200: ${{ inputs.use_h200 }}
+      use_b200: ${{ inputs.use_b200 }}
+      use_mi300x: ${{ inputs.use_mi300x }}
+      use_mi325x: ${{ inputs.use_mi325x }}
+      use_mi355x: ${{ inputs.use_mi355x }}
 
-  # collect-gptoss-8k1k-results:
-  #   needs: gptoss-8k1k
-  #   if: ${{ inputs.run_8k1k && always() }}
-  #   uses: ./.github/workflows/collect-results.yml
-  #   secrets: inherit
-  #   with:
-  #     exp-name: 'gptoss_8k1k'
+  collect-gptoss-8k1k-results:
+    needs: gptoss-8k1k
+    if: ${{ inputs.run_8k1k && always() }}
+    uses: ./.github/workflows/collect-results.yml
+    secrets: inherit
+    with:
+      exp-name: 'gptoss_8k1k'
 
   _70b-1k8k:
     if: ${{ inputs.run_1k8k }}

From 964c5c2367dd5ba4e6dd98b9f7fb020fec754760 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 13:10:10 -0500
Subject: [PATCH 15/17] add timeout to correct job

---
 .github/workflows/benchmark-tmpl.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index 77df1aee8..ee58f4bb7 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -49,6 +49,7 @@ env:
 
 jobs:
   flatten-search-space-matrix:
+    timeout-minutes: 2
     runs-on: ubuntu-latest
     outputs:
       flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }}
@@ -64,7 +65,7 @@ jobs:
   benchmark:
     runs-on: ${{ inputs.runner }}
     needs: flatten-search-space-matrix
-    timeout-minutes: 2
+    timeout-minutes: 180
 
     strategy:
       fail-fast: false

From b0ff2bad6de3c66556547b0c0d6bb7cb7d00e934 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 13:17:49 -0500
Subject: [PATCH 16/17] add descriptive name to flatten job, set its timeout to 180

---
 .github/workflows/benchmark-tmpl.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml
index ee58f4bb7..2eaaa7edd 100644
--- a/.github/workflows/benchmark-tmpl.yml
+++ b/.github/workflows/benchmark-tmpl.yml
@@ -49,7 +49,8 @@ env:
 
 jobs:
   flatten-search-space-matrix:
-    timeout-minutes: 2
+    name: get search space ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }}
+    timeout-minutes: 180
     runs-on: ubuntu-latest
     outputs:
       flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }}

From 488315000f114c5c6ddaf7bd263e1a91a0eb8070 Mon Sep 17 00:00:00 2001
From: Cameron Quilici <cjquilici@gmail.com>
Date: Fri, 17 Oct 2025 13:26:01 -0500
Subject: [PATCH 17/17] fix moe backend

---
 benchmarks/dsr1_fp4_b200_trt_slurm.sh | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
index b78e1ecfb..d13584078 100644
--- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh
+++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh
@@ -18,19 +18,14 @@
 
 echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 
-# Default backend is TRTLLM
-if [[ "MOE_BACKEND" == "NONE" ]]; then
-    MOE_BACKEND="TRTLLM"
-fi
+# Default
+MOE_BACKEND="TRTLLM"
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND"
 
 hf download $MODEL
 
 # ========= Determine MOE_BACKEND based on ISL, OSL, CONC =========
-# Default
-MOE_BACKEND="TRTLLM"
-
 if [[ "$TP" == "4" ]]; then
     if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
         if [[ $CONC -ge 256 ]]; then