diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md new file mode 100644 index 000000000..9021ec83e --- /dev/null +++ b/.github/configs/CONFIGS.md @@ -0,0 +1,80 @@ +# Search Space Configuration + +This file defines which benchmark configurations to run for each model, GPU, and precision combination. + +## Quick Start + +Add a new configuration by following this pattern: +```yaml +model_name: + precision: + gpu_type: + sequence_length: + - { tp: [1, 2, 4], conc: { start: 4, end: 64 } } +``` + +## Field Definitions + +### Required Fields + +- **`tp`**: Tensor Parallelism (number of GPUs) + - Single value: `tp: 4` + - Multiple values: `tp: [2, 4, 8]` + +- **`conc`**: Concurrency (number of simultaneous requests) + - `start`: First value to test + - `end`: Last value to test + - `step`: Multiplier (default: 2) + - Example: `{start: 4, end: 64}` → tests [4, 8, 16, 32, 64] + +### Optional Fields + +- **`ep`**: Expert Parallelism for MoE models (default: 1) + +- **`dp_attention`**: Data Parallel Attention (default: `"false"`) + +## Examples + +### Basic configuration +```yaml +gptoss: + fp4: + h100: + 1k1k: # 1024 input, 1024 output + - { tp: [2, 4, 8], conc: { start: 4, end: 64 } } +``` +This tests 15 combinations: 3 TP values × 5 concurrency values + +### Configuration with optional fields +```yaml +dsr1: + fp4: + b200-trt: + 1k1k: + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } +``` + +### Custom step factor +```yaml +llama: + fp8: + b200: + 1k8k: + - { tp: 2, conc: { start: 4, end: 64, step: 4 } } +``` +This tests [4, 16, 64] (multiplies by 4 instead of default 2) + +## Key Points + +1. **Models**: `gptoss`, `llama`, `dsr1` +2. **Precisions**: `fp4`, `fp8` +3. **Sequence lengths**: `1k1k`, `1k8k`, `8k1k` (input×output) +4. Each entry expands to test all combinations of TP and concurrency values +5. 
There are comments throughout the yaml that were ported over from bash scripts describing what parallelism settings should be set depending on concurrency -- keep an eye out for those. + +## Testing Your Changes + +Run the flattening script to validate: +```bash +python utils/flatten_matrix.py +``` \ No newline at end of file diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml new file mode 100644 index 000000000..f89643d14 --- /dev/null +++ b/.github/configs/search-space.yml @@ -0,0 +1,425 @@ +gptoss: + fp4: + h100: + 1k1k: + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: [ 8 ], conc: { start: 4, end: 32 } } + h200: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 1, conc: { start: 4, end: 16 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + h200-trt: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + b200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 
4, end: 64 } } + b200-trt: + # NOTE: Regardless of TP, if CONC >= 256, DP_ATTENTION should be set + # to true and EP should be set to TP, i.e., + # For lower concurrencies (CONC < 256), use TP Attention; Switch to + # MoE Expert parallel for concurrency >=16 (1k1k and 1k8k) + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + # EP=4 iff TP=4 and CONC >= 16 + - { tp: 4, conc: { start: 4, end: 8 } } + - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + # EP=8 iff TP=8 and CONC >= 16 + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + # EP=4 iff TP=4 and CONC >= 16 + - { tp: 4, conc: { start: 4, end: 8 } } + - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + # EP=8 iff TP=8 and CONC >= 16 + - { tp: 8, conc: { start: 4, end: 8 } } + 8k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + mi300x: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 8k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi325x: + 1k1k: + - { tp: [ 1, 2, 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 4 ], conc: { start: 64, end: 64 } } + - { tp: [ 2, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 8 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi355x: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 16 } } + 1k8k: + - { tp: [ 1, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 
4, end: 16 } } + +llama: + fp4: + b200: + # fix: add TP=2,4 to B200, just as mi355 has + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 16, end: 64 } } + - { tp: 2, conc: { start: 16, end: 64 } } + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 8, conc: { start: 4, end: 8 } } + b200-trt: + # fix: add TP=2,4 to B200, just as mi355 has + # B200 can achieve TPS/User >= 30 with larger concurrency till 128 + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 128 } } + - { tp: 2, conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi355x: + 1k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + fp8: + h100: + 1k1k: + - { tp: 2, conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 2, conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 2, 
conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + h200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 16, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + h200-trt: + # H200 can achieve TPS/User >= 30 with larger concurrency till 128 + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + b200: + # fix: add TP=2,4 to B200, just as mi355 has + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: 4, conc: { start: 16, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: 2, conc: { start: 16, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + b200-trt: + # fix: add TP=2,4 to B200, just as mi355 has + # B200 can achieve TPS/User >= 30 with larger concurrency till 256 + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - 
{ tp: 4, conc: { start: 16, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 128 } } + - { tp: 2, conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi300x: + 1k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + mi325x: + 1k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 32, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 64, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 16, end: 64 } } + - { tp: 2, conc: { start: 4, end: 32 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + mi355x: + 1k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + +dsr1: + fp4: + b200: + 1k1k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } } + 1k8k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } } + 8k1k: + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 16 } } + b200-trt: + # Determine DP_ATTENTION, and EP_SIZE based on ISL, OSL, CONC + # For ISL/OSL = 1k/1k + 1k1k: + # If TP=4, + # EP_SIZE=4 iff CONC > 32 + # DP_ATTENTION=true iff CONC >= 256 + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } + # If TP=8, + # EP_SIZE=8 iff CONC > 8 + # 
DP_ATTENTION=true iff CONC >= 256 + - { tp: 8, conc: { start: 4, end: 8 } } + - { tp: 8, ep: 8, conc: { start: 16, end: 128 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } } + # For ISL/OSL = 1k/8k + 1k8k: + # If TP=4, + # EP_SIZE=4 iff CONC > 32 + # DP_ATTENTION=true iff CONC >= 256 + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } + # If TP=8, + # EP_SIZE=8 iff CONC > 16 + # DP_ATTENTION=true iff CONC >= 256 + - { tp: 8, conc: { start: 4, end: 16 } } + - { tp: 8, ep: 8, conc: { start: 32, end: 128 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } } + # For ISL/OSL = 8k/1k + 8k1k: + # If TP=4, + # EP_SIZE=4 and DP_ATTENTION=true iff CONC > 32 + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 64, end: 256 } } + # If TP=8, + # EP_SIZE=8 and DP_ATTENTION=true iff CONC > 32 + - { tp: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 256 } } + gb200: + 1k1k: + - { tp: 12, conc: { start: 4300, end: 4300 } } + - { tp: 24, conc: { start: 4300, end: 4300 } } + - { tp: 24, conc: { start: 2048, end: 2048 } } + - { tp: 20, conc: { start: 1075, end: 1075 } } + - { tp: 36, conc: { start: 1075, end: 1075 } } + - { tp: 36, conc: { start: 564, end: 564 } } + - { tp: 36, conc: { start: 4, end: 256 } } + 8k1k: + - { tp: 28, conc: { start: 2150, end: 2150 } } + - { tp: 48, conc: { start: 2150, end: 2150 } } + - { tp: 40, conc: { start: 1075, end: 1075 } } + - { tp: 48, conc: { start: 538, end: 538 } } + - { tp: 48, conc: { start: 256, end: 256 } } + - { tp: 28, conc: { start: 102, end: 102 } } + - { tp: 28, conc: { start: 3, end: 48 } } + gb200-mtp: + 1k1k: + - { tp: 12, conc: { start: 2252, end: 2252 } } + - { tp: 24, conc: { start: 2150, end: 2150 } } + - { tp: 20, conc: { start: 1075, end: 1075 } } + - { tp: 
20, conc: { start: 512, end: 512 } } + - { tp: 36, conc: { start: 512, end: 512 } } + - { tp: 36, conc: { start: 144, end: 144 } } + - { tp: 36, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 32, conc: { start: 2150, end: 2150 } } + - { tp: 48, conc: { start: 1075, end: 1075 } } + - { tp: 64, conc: { start: 538, end: 538 } } + - { tp: 52, conc: { start: 269, end: 269 } } + - { tp: 52, conc: { start: 128, end: 128 } } + - { tp: 28, conc: { start: 54, end: 54 } } + - { tp: 28, conc: { start: 3, end: 24 } } + mi355x: + 1k1k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + fp8: + h200: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + h200-trt: + # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC + # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 64 + 1k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64 + 1k8k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 32 + 8k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } + b200: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + b200-trt: + # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC + # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 32 + 1k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } + # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64 + 1k8k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 64 + 8k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + gb200: 
+ 1k1k: + - { tp: 72, conc: { start: 8192, end: 8192 } } + - { tp: 72, conc: { start: 6144, end: 6144 } } + - { tp: 72, conc: { start: 5632, end: 5632 } } + - { tp: 72, conc: { start: 5376, end: 5376 } } + - { tp: 72, conc: { start: 5120, end: 5120 } } + - { tp: 72, conc: { start: 4992, end: 4992 } } + - { tp: 72, conc: { start: 4864, end: 4864 } } + - { tp: 72, conc: { start: 4608, end: 4608 } } + - { tp: 72, conc: { start: 1024, end: 4096 } } + 8k1k: + - { tp: 72, conc: { start: 128, end: 4096 } } + - { tp: 72, conc: { start: 576, end: 576 } } + - { tp: 72, conc: { start: 448, end: 448 } } + - { tp: 72, conc: { start: 384, end: 384 } } + mi300x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + mi325x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + mi355x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 3d1dd5051..cb9776b64 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -54,7 +54,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[2, 4, 8]' bmk-h200-fp8: if: ${{ inputs.use_h200 }} @@ -71,7 +70,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-h200-trt-fp8: if: ${{ inputs.use_h200 }} @@ -88,8 +86,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger 
concurrency till 128 bmk-b200-fp8: if: ${{ inputs.use_b200 }} @@ -106,7 +102,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has bmk-b200-trt-fp8: if: ${{ inputs.use_b200 }} @@ -123,8 +118,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 bmk-mi300x-fp8: if: ${{ inputs.use_mi300x }} @@ -141,7 +134,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-mi325x-fp8: if: ${{ inputs.use_mi325x }} @@ -158,7 +150,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-mi355x-fp8: if: ${{ inputs.use_mi355x }} @@ -175,7 +166,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-b200-fp4: if: ${{ inputs.use_b200 }} @@ -192,7 +182,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has bmk-b200-trt-fp4: if: ${{ inputs.use_b200 }} @@ -209,8 +198,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 bmk-mi355x-fp4: if: ${{ inputs.use_mi355x }} @@ -226,5 +213,4 @@ jobs: isl: ${{ 
inputs.isl }} osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' + random-range-ratio: ${{ inputs.random-range-ratio }} \ No newline at end of file diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 07f5b876d..b5bcc5817 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -84,6 +84,8 @@ jobs: exit 1 fi + # NOTE: https://github.com/InferenceMAX/InferenceMAX/pull/111 adds EP_SIZE and DP_ATTENTION parsing to the process_results.py script + # but it is not yet implemented for GB200 multi-node, therefore just default to: 1 "false" - name: Process results run: | # Process each result file @@ -93,7 +95,7 @@ jobs: # Extract GPU count from filename for tp_size calculation gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/") if [ -n "$gpus" ]; then - python3 utils/process_result.py ${{ inputs.runner }} $gpus ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE + python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 "false" ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE fi fi done diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 313087946..2eaaa7edd 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -32,17 +32,12 @@ on: random-range-ratio: required: true type: string - tp-list: - required: true - type: string - conc-list: - type: string - default: '[4, 8, 16, 32, 64]' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_CACHE: '/mnt/hf_hub_cache/' EXP_NAME: ${{ inputs.exp-name }} + RUNNER: ${{ inputs.runner }} MODEL: ${{ inputs.model }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} @@ -53,20 +48,38 @@ env: PRECISION: ${{ inputs.precision }} jobs: + flatten-search-space-matrix: + name: get search space ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ 
inputs.precision }} + timeout-minutes: 180 + runs-on: ubuntu-latest + outputs: + flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }} + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - id: flatten + run: python3 ${GITHUB_WORKSPACE}/utils/flatten_matrix.py + benchmark: runs-on: ${{ inputs.runner }} + needs: flatten-search-space-matrix timeout-minutes: 180 strategy: fail-fast: false matrix: - tp: ${{ fromJson(inputs.tp-list) }} - conc: ${{ fromJson(inputs.conc-list) }} - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.tp }} conc${{ matrix.conc }}' + config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }} + + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}' env: - TP: ${{ matrix.tp }} - CONC: ${{ matrix.conc }} + TP: ${{ matrix.config.tp }} + CONC: ${{ matrix.config.conc }} + EP_SIZE: ${{ matrix.config.ep || 1 }} + DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }} steps: - name: Resource cleanup @@ -139,7 +152,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} $TP $RESULT_FILENAME $FRAMEWORK $PRECISION + python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION - name: Upload result uses: actions/upload-artifact@v4 diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 3a48710f2..c8b7fd64d 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -40,22 +40,21 @@ on: default: false jobs: - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'lmsysorg/sglang:v0.5.2rc2-cu126' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ 
inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' + # bmk-h200-fp8: + # if: ${{ inputs.use_h200 }} + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # runner: h200 + # image: 'lmsysorg/sglang:v0.5.2rc2-cu126' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # framework: 'sglang' + # precision: 'fp8' + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} bmk-h200-trt-fp8: if: ${{ inputs.use_h200 }} @@ -72,7 +71,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-fp8: if: ${{ inputs.use_b200 }} @@ -89,7 +87,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-trt-fp8: if: ${{ inputs.use_b200 }} @@ -106,7 +103,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi300x-fp8: if: ${{ inputs.use_mi300x }} @@ -123,7 +119,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi325x-fp8: if: ${{ inputs.use_mi325x }} @@ -140,7 +135,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi355x-fp8: if: ${{ inputs.use_mi355x }} @@ -157,7 +151,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-fp4: if: ${{ inputs.use_b200 }} @@ -174,8 +167,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: 
${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4,8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # Custom concurrency values for this job bmk-b200-trt-fp4: if: ${{ inputs.use_b200 }} @@ -192,8 +183,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128, 256]' # DPA4EP4 is already 30 tok/s/user and DPA8EP8 is already 35tok/s/user. 512 conc would be too much so we skipping it bmk-mi355x-fp4: if: ${{ inputs.use_mi355x }} @@ -210,8 +199,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - # These tensor parallelism settings are not necessary as they cannot fall on the Pareto frontier with this particular container - we remove them to save CI time. - tp-list: ${{ inputs.isl == 1024 && inputs.osl == 1024 && '[4, 8]' || '[8]' }} bmk-gb200-fp4-multinode-mtp-off: if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml index 0c505de07..03a205ff5 100644 --- a/.github/workflows/gptoss-tmpl.yml +++ b/.github/workflows/gptoss-tmpl.yml @@ -52,7 +52,6 @@ jobs: runner: h100 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -69,7 +68,6 @@ jobs: runner: h200 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -86,7 +84,6 @@ jobs: runner: b200 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -103,7 +100,6 @@ jobs: runner: b200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'trt' precision: 'fp4' @@ -120,7 +116,6 @@ jobs: runner: 
h200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'trt' precision: 'fp4' @@ -137,7 +132,6 @@ jobs: runner: mi300x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -154,7 +148,6 @@ jobs: runner: mi325x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -171,6 +164,5 @@ jobs: runner: mi355x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 4, 8]' framework: 'vllm' precision: 'fp4' diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index ffdae541c..d13584078 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -13,69 +13,50 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# Default +MOE_BACKEND="TRTLLM" -hf download $MODEL +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false +hf download $MODEL +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= if [[ "$TP" == "4" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && 
"$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi elif [[ "$TP" == "8" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 8 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 16 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index e909b954a..6bc8c9fa7 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="DEEPGEMM" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', 
DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 20101e466..5dfdf8617 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="CUTLASS" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 96745306a..4f17d4d4f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -13,48 +13,31 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: 
$EP_SIZE, DP_ATTENTION=$DP_ATTENTION" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# Default MOE_BACKEND="TRTLLM" -DP_ATTENTION=false - -# Lower concurrencies: Concurrency < 256 -# MoE backend=TRTLLM -# Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k) -TEP_REQUIRED=false -if [[ "$TP" == "4" || "$TP" == "8" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - TEP_REQUIRED=true - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - TEP_REQUIRED=true - fi -fi -if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then - EP_SIZE="$TP" -fi # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to $MOE_BACKEND" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 diff --git a/utils/count_num_jobs.py b/utils/count_num_jobs.py new file mode 100644 index 000000000..776fd2a65 --- /dev/null +++ b/utils/count_num_jobs.py @@ -0,0 +1,37 @@ +import yaml +from collections import defaultdict + +with open('.github/configs/search-space.yml', 'r') as f: + data = yaml.safe_load(f) + +gpu_totals = defaultdict(int) +overall_total = 0 + +for model in data.values(): + for precision in model.values(): + for gpu, runner_data in precision.items(): + for seq_len in runner_data.values(): + for entry in seq_len: + # Count TP values + tp_list = entry['tp'] if isinstance(entry['tp'], list) else [entry['tp']] + tp_count = len(tp_list) + + # Count CONC values + conc = entry['conc'] + start, end = conc['start'], conc['end'] + step = conc.get('step', 
import yaml
import os
import json


def flatten_search_space(config_path, runner, model, isl, osl, precision):
    """Expand the benchmark search-space config into a flat list of job entries.

    Loads the YAML search space at *config_path*, selects the entry list for
    the (model, precision, runner, isl x osl) combination, and expands each
    entry's ``tp`` value(s) and ``conc`` geometric range into one dict per
    (tp, conc) pair.  Optional fields (``ep``, ``dp_attention``, ...) are
    carried through unchanged.

    Raises:
        AssertionError: if the combination is unknown or an entry is
            malformed.  Message texts are stable — the test suite matches
            on them with ``pytest.raises(match=...)``.
    """
    with open(config_path, 'r') as f:
        search_space = yaml.safe_load(f)

    seq_len_map = {
        '1024': '1k',
        '8192': '8k',
    }

    model_map = {
        'gpt-oss': 'gptoss',
        'llama-3.3-70b-instruct': 'llama',
        'deepseek-r1-0528': 'dsr1',
    }

    # Map the full model name to its short config key via case-insensitive
    # substring match (e.g. "openai/gpt-oss-120b" -> "gptoss").
    model_lower = model.lower()
    model_key = next(
        (short for name, short in model_map.items() if name in model_lower),
        None,
    )
    assert model_key, f"model '{model}' not recognized"

    assert seq_len_map.get(isl) and seq_len_map.get(osl), \
        "either isl or osl not recognized"
    seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}"

    # Hoist the nested lookups so each level is traversed once.
    model_cfg = search_space.get(model_key, {})
    assert model_cfg.get(precision), f"precision '{precision}' not recognized"
    runner_cfg = model_cfg[precision]
    assert runner_cfg.get(runner), f"runner '{runner}' not recognized"

    # Missing sequence-length key is not an error: it simply means no
    # configurations are defined for that combination.
    entries = runner_cfg[runner].get(seq_len, [])

    flattened_search_space = []
    for entry in entries:
        tp_list = _validated_tp_list(entry)
        conc_list = _expanded_conc_list(entry)
        for tp_value in tp_list:
            for conc_value in conc_list:
                # Shallow copy is sufficient: only the scalar 'tp' and the
                # 'conc' dict (replaced by an int) are overwritten.
                new_entry = entry.copy()
                new_entry['tp'] = tp_value
                new_entry['conc'] = conc_value
                flattened_search_space.append(new_entry)

    return flattened_search_space


def _validated_tp_list(entry):
    """Return entry['tp'] normalized to a list of ints, asserting it is well-formed."""
    assert entry.get('tp'), "entry malformed, expecting field 'tp'"
    tp = entry['tp']
    assert isinstance(tp, int) or (isinstance(tp, list) and all(isinstance(x, int) for x in tp)), \
        "entry malformed, expecting field 'tp' to be either an int or list of ints"
    return tp if isinstance(tp, list) else [tp]


def _expanded_conc_list(entry):
    """Expand entry['conc'] ({start, end, step?}) into a geometric progression.

    'step' defaults to 2, so {start: 4, end: 64} yields [4, 8, 16, 32, 64].
    """
    conc_config = entry.get('conc')
    assert conc_config, "entry malformed, missing field 'conc'"
    assert isinstance(conc_config, dict), "entry malformed, 'conc' must be a dict"
    assert 'start' in conc_config, "entry malformed, 'conc' missing required field 'start'"
    assert 'end' in conc_config, "entry malformed, 'conc' missing required field 'end'"
    assert isinstance(conc_config['start'], int), "entry malformed, 'conc.start' must be an int"
    assert isinstance(conc_config['end'], int), "entry malformed, 'conc.end' must be an int"
    assert conc_config['start'] <= conc_config['end'], \
        "entry malformed, 'conc.start' must be <= 'conc.end'"

    start = conc_config['start']
    end = conc_config['end']
    step_factor = conc_config.get('step', 2)

    # Only validate 'step' when explicitly given; the default (2) is known good.
    if 'step' in conc_config:
        assert isinstance(step_factor, int), "entry malformed, 'conc.step' must be an int"
        assert step_factor > 1, "entry malformed, 'conc.step' must be > 1"

    conc_list = []
    current = start
    while current <= end:
        conc_list.append(current)
        current *= step_factor
    return conc_list


def main():
    """Flatten the search space selected by env vars and emit it as a GitHub Actions output.

    Reads RUNNER/MODEL/ISL/OSL/PRECISION from the environment and appends a
    ``flattened-matrix=<json>`` line to the file named by GITHUB_OUTPUT.
    """
    config_path = '.github/configs/search-space.yml'
    runner = os.environ['RUNNER']
    model = os.environ['MODEL']
    isl = os.environ['ISL']
    osl = os.environ['OSL']
    precision = os.environ['PRECISION']

    flattened_search_space = flatten_search_space(
        config_path, runner, model, isl, osl, precision
    )

    with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
        f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n")


if __name__ == '__main__':
    main()
int(sys.argv[2]) -result_filename = sys.argv[3] -framework = sys.argv[4] -precision = sys.argv[5] +ep_size = int(sys.argv[3]) +dp_attention = sys.argv[4] +result_filename = sys.argv[5] +framework = sys.argv[6] +precision = sys.argv[7] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) @@ -15,7 +17,9 @@ data = { 'hw': hw, 'tp': tp_size, + 'ep': ep_size, 'conc': int(bmk_result['max_concurrency']), + 'dp_attention': dp_attention, # true or false 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, @@ -23,8 +27,8 @@ 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size } -if len(sys.argv) == 7: # MTP - data['mtp'] = sys.argv[6] +if len(sys.argv) == 9: # MTP + data['mtp'] = sys.argv[8] for key, value in bmk_result.items(): if key.endswith('ms'): diff --git a/utils/summarize.py b/utils/summarize.py index 1f78caf9c..de8863c78 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -9,11 +9,11 @@ with open(result_path) as f: result = json.load(f) results.append(result) -results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['conc'])) +results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) summary_header = f'''\ -| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) @@ -25,7 +25,9 @@ f"| {framework.upper()} " f"| {precision.upper()} " f"| {result['tp']} " + f"| {result['ep']} " f"| {result['conc']} " + f"| {result['dp_attention']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " f"| {result['median_e2el']:.4f} " diff --git 
a/utils/test_flatten_matrix.py b/utils/test_flatten_matrix.py new file mode 100644 index 000000000..b4f0620ad --- /dev/null +++ b/utils/test_flatten_matrix.py @@ -0,0 +1,469 @@ +import pytest +import yaml +from flatten_matrix import flatten_search_space + + +@pytest.fixture +def minimal_config(): + """Minimal valid config for testing""" + return { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': [2, 4], 'conc': {'start': 4, 'end': 8}} + ] + } + } + }, + 'llama': { + 'fp8': { + 'b200': { + '1k8k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 64, 'step': 4}} + ] + } + } + }, + 'dsr1': { + 'fp4': { + 'b200-trt': { + '8k1k': [ + {'tp': 4, 'conc': {'start': 4, 'end': 32}}, + {'tp': 4, 'ep': 4, 'dp_attention': 'true', 'conc': {'start': 64, 'end': 256}} + ] + } + } + } + } + + +@pytest.fixture +def config_file(minimal_config, tmp_path): + # temp config file + config_path = tmp_path / "search-space.yml" + with open(config_path, 'w') as f: + yaml.dump(minimal_config, f) + return config_path + + +class TestValidCases: + """Test valid input scenarios""" + + def test_single_tp_value(self, config_file): + """Test with single TP value""" + result = flatten_search_space( + config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8' + ) + + # Should generate: tp=2, conc=[4, 16, 64] with step=4 + assert len(result) == 3 + assert all(entry['tp'] == 2 for entry in result) + assert [entry['conc'] for entry in result] == [4, 16, 64] + + def test_list_of_tp_values(self, config_file): + """Test with list of TP values""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + # Should generate: tp=[2,4], conc=[4,8] = 2*2 = 4 combinations + assert len(result) == 4 + tp_values = [entry['tp'] for entry in result] + assert tp_values.count(2) == 2 + assert tp_values.count(4) == 2 + + def test_optional_fields_preserved(self, config_file): + """Test that optional fields like ep and dp_attention are preserved""" + result = 
flatten_search_space( + config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4' + ) + + # Second entry should have ep and dp_attention + entries_with_ep = [e for e in result if 'ep' in e] + assert len(entries_with_ep) > 0 + assert all(e['ep'] == 4 for e in entries_with_ep) + + entries_with_dp = [e for e in result if 'dp_attention' in e] + assert len(entries_with_dp) > 0 + assert all(e['dp_attention'] == 'true' for e in entries_with_dp) + + def test_default_step_factor(self, config_file): + """Test that default step factor of 2 is used when not specified""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + # conc: start=4, end=8, default step=2 -> [4, 8] + conc_values = sorted(set(entry['conc'] for entry in result)) + assert conc_values == [4, 8] + + def test_custom_step_factor(self, config_file): + """Test custom step factor""" + result = flatten_search_space( + config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8' + ) + + # conc: start=4, end=64, step=4 -> [4, 16, 64] + conc_values = sorted(set(entry['conc'] for entry in result)) + assert conc_values == [4, 16, 64] + + +class TestModelMapping: + """Test model name mapping""" + + def test_gptoss_mapping(self, config_file): + """Test gpt-oss maps to gptoss""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + assert len(result) > 0 + + def test_llama_mapping(self, config_file): + """Test llama mapping with case insensitivity""" + result = flatten_search_space( + config_file, 'b200', 'LLAMA-3.3-70B-INSTRUCT', '1024', '8192', 'fp8' + ) + assert len(result) > 0 + + def test_dsr1_mapping(self, config_file): + """Test deepseek-r1 maps to dsr1""" + result = flatten_search_space( + config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4' + ) + assert len(result) > 0 + + +class TestInvalidInputs: + """Test error handling for invalid inputs""" + + def test_unrecognized_model(self, config_file): + 
"""Test error for unrecognized model""" + with pytest.raises(AssertionError, match="model .* not recognized"): + flatten_search_space( + config_file, 'h100', 'unknown-model', '1024', '1024', 'fp4' + ) + + def test_invalid_isl(self, config_file): + """Test error for invalid ISL""" + with pytest.raises(AssertionError, match="either isl or osl not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '2048', '1024', 'fp4' + ) + + def test_invalid_osl(self, config_file): + """Test error for invalid OSL""" + with pytest.raises(AssertionError, match="either isl or osl not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '4096', 'fp4' + ) + + def test_invalid_precision(self, config_file): + """Test error for invalid precision""" + with pytest.raises(AssertionError, match="precision .* not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp16' + ) + + def test_invalid_runner(self, config_file): + """Test error for invalid runner""" + with pytest.raises(AssertionError, match="runner .* not recognized"): + flatten_search_space( + config_file, 'a100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + +class TestMalformedEntries: + """Test validation of malformed config entries""" + + def test_missing_tp_field(self, tmp_path): + """Test error when tp field is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="entry malformed, expecting field 'tp'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_tp_wrong_type(self, tmp_path): + """Test error when tp is wrong type""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 'invalid', 'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" 
+ with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_tp_list_with_non_ints(self, tmp_path): + """Test error when tp list contains non-integers""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': [2, 'four', 8], 'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_missing_conc_field(self, tmp_path): + """Test error when conc field is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="entry malformed, missing field 'conc'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_not_dict(self, tmp_path): + """Test error when conc is not a dict""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': [4, 8, 16]} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc' must be a dict"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_missing_start(self, tmp_path): + """Test error when conc.start is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with 
pytest.raises(AssertionError, match="'conc' missing required field 'start'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_missing_end(self, tmp_path): + """Test error when conc.end is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc' missing required field 'end'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_start_not_int(self, tmp_path): + """Test error when conc.start is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': '4', 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.start' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_end_not_int(self, tmp_path): + """Test error when conc.end is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': '8'}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.end' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_start_greater_than_end(self, tmp_path): + """Test error when conc.start > conc.end""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 16, 'end': 4}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.start' must be <= 'conc.end'"): + 
flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_step_not_int(self, tmp_path): + """Test error when step is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': '2'}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.step' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_step_not_greater_than_one(self, tmp_path): + """Test error when step <= 1""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': 1}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.step' must be > 1"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + +class TestConcurrencyGeneration: + """Test concurrency value generation logic""" + + def test_geometric_progression(self, tmp_path): + """Test that concurrency values follow geometric progression""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 1, 'conc': {'start': 4, 'end': 64, 'step': 2}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + result = flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + conc_values = [entry['conc'] for entry in result] + assert conc_values == [4, 8, 16, 32, 64] + + def test_single_conc_value(self, tmp_path): + """Test when start equals end""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 1, 'conc': {'start': 64, 'end': 64}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + result = 
flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + assert len(result) == 1 + assert result[0]['conc'] == 64 + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) \ No newline at end of file