diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md new file mode 100644 index 000000000..9021ec83e --- /dev/null +++ b/.github/configs/CONFIGS.md @@ -0,0 +1,80 @@ +# Search Space Configuration + +This file defines which benchmark configurations to run for each model, GPU, and precision combination. + +## Quick Start + +Add a new configuration by following this pattern: +```yaml +model_name: + precision: + gpu_type: + sequence_length: + - { tp: [1, 2, 4], conc: { start: 4, end: 64 } } +``` + +## Field Definitions + +### Required Fields + +- **`tp`**: Tensor Parallelism (number of GPUs) + - Single value: `tp: 4` + - Multiple values: `tp: [2, 4, 8]` + +- **`conc`**: Concurrency (number of simultaneous requests) + - `start`: First value to test + - `end`: Last value to test + - `step`: Multiplier (default: 2) + - Example: `{start: 4, end: 64}` → tests [4, 8, 16, 32, 64] + +### Optional Fields + +- **`ep`**: Expert Parallelism for MoE models (default: 1) + +- **`dp_attention`**: Data Parallel Attention (default: `"false"`) + +## Examples + +### Basic configuration +```yaml +gptoss: + fp4: + h100: + 1k1k: # 1024 input, 1024 output + - { tp: [2, 4, 8], conc: { start: 4, end: 64 } } +``` +This tests 15 combinations: 3 TP values × 5 concurrency values + +### Configuration with optional fields +```yaml +dsr1: + fp4: + b200-trt: + 1k1k: + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } +``` + +### Custom step factor +```yaml +llama: + fp8: + b200: + 1k8k: + - { tp: 2, conc: { start: 4, end: 64, step: 4 } } +``` +This tests [4, 16, 64] (multiplies by 4 instead of default 2) + +## Key Points + +1. **Models**: `gptoss`, `llama`, `dsr1` +2. **Precisions**: `fp4`, `fp8` +3. **Sequence lengths**: `1k1k`, `1k8k`, `8k1k` (input×output) +4. Each entry expands to test all combinations of TP and concurrency values +5. 
There are comments throughout the yaml that were ported over from bash scripts describing what parallelism settings should be set depending on concurrency -- keep an eye out for those. + +## Testing Your Changes + +Run the flattening script to validate: +```bash +python utils/flatten_matrix.py +``` \ No newline at end of file diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml new file mode 100644 index 000000000..f89643d14 --- /dev/null +++ b/.github/configs/search-space.yml @@ -0,0 +1,425 @@ +gptoss: + fp4: + h100: + 1k1k: + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: [ 8 ], conc: { start: 4, end: 32 } } + h200: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 1, conc: { start: 4, end: 16 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + h200-trt: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + b200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 
4, end: 64 } } + b200-trt: + # NOTE: Regardless of TP, if CONC >= 256, DP_ATTENTION should be set + # to true and EP should be set to TP, i.e., + # For lower concurrencies (CONC < 256), use TP Attention; Switch to + # MoE Expert parallel for concurrency >=16 (1k1k and 1k8k) + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + # EP=4 iff TP=4 and CONC >= 16 + - { tp: 4, conc: { start: 4, end: 8 } } + - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + # EP=8 iff TP=8 and CONC >= 16 + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + # EP=4 iff TP=4 and CONC >= 16 + - { tp: 4, conc: { start: 4, end: 8 } } + - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + # EP=8 iff TP=8 and CONC >= 16 + - { tp: 8, conc: { start: 4, end: 8 } } + 8k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + mi300x: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 8k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi325x: + 1k1k: + - { tp: [ 1, 2, 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 4 ], conc: { start: 64, end: 64 } } + - { tp: [ 2, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 8 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi355x: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 16 } } + 1k8k: + - { tp: [ 1, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 
4, end: 16 } } + +llama: + fp4: + b200: + # fix: add TP=2,4 to B200, just as mi355 has + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 16, end: 64 } } + - { tp: 2, conc: { start: 16, end: 64 } } + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 8, conc: { start: 4, end: 8 } } + b200-trt: + # fix: add TP=2,4 to B200, just as mi355 has + # B200 can achieve TPS/User >= 30 with larger concurrency till 128 + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 128 } } + - { tp: 2, conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi355x: + 1k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + fp8: + h100: + 1k1k: + - { tp: 2, conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 2, conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 2, 
conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + h200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 16, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + h200-trt: + # H200 can achieve TPS/User >= 30 with larger concurrency till 128 + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + b200: + # fix: add TP=2,4 to B200, just as mi355 has + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: 4, conc: { start: 16, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: 2, conc: { start: 16, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + b200-trt: + # fix: add TP=2,4 to B200, just as mi355 has + # B200 can achieve TPS/User >= 30 with larger concurrency till 256 + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - 
{ tp: 4, conc: { start: 16, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 128 } } + - { tp: 2, conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi300x: + 1k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + mi325x: + 1k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 32, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 64, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 16, end: 64 } } + - { tp: 2, conc: { start: 4, end: 32 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + mi355x: + 1k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + +dsr1: + fp4: + b200: + 1k1k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } } + 1k8k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } } + 8k1k: + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 16 } } + b200-trt: + # Determine DP_ATTENTION, and EP_SIZE based on ISL, OSL, CONC + # For ISL/OSL = 1k/1k + 1k1k: + # If TP=4, + # EP_SIZE=4 iff CONC > 32 + # DP_ATTENTION=true iff CONC >= 256 + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } + # If TP=8, + # EP_SIZE=8 iff CONC > 8 + # 
DP_ATTENTION=true iff CONC >= 256 + - { tp: 8, conc: { start: 4, end: 8 } } + - { tp: 8, ep: 8, conc: { start: 16, end: 128 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } } + # For ISL/OSL = 1k/8k + 1k8k: + # If TP=4, + # EP_SIZE=4 iff CONC > 32 + # DP_ATTENTION=true iff CONC >= 256 + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } + # If TP=8, + # EP_SIZE=8 iff CONC > 16 + # DP_ATTENTION=true iff CONC >= 256 + - { tp: 8, conc: { start: 4, end: 16 } } + - { tp: 8, ep: 8, conc: { start: 32, end: 128 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } } + # For ISL/OSL = 8k/1k + 8k1k: + # If TP=4, + # EP_SIZE=4 and DP_ATTENTION=true iff CONC > 32 + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 64, end: 256 } } + # If TP=8, + # EP_SIZE=8 and DP_ATTENTION=true iff CONC > 32 + - { tp: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 256 } } + gb200: + 1k1k: + - { tp: 12, conc: { start: 4300, end: 4300 } } + - { tp: 24, conc: { start: 4300, end: 4300 } } + - { tp: 24, conc: { start: 2048, end: 2048 } } + - { tp: 20, conc: { start: 1075, end: 1075 } } + - { tp: 36, conc: { start: 1075, end: 1075 } } + - { tp: 36, conc: { start: 564, end: 564 } } + - { tp: 36, conc: { start: 4, end: 256 } } + 8k1k: + - { tp: 28, conc: { start: 2150, end: 2150 } } + - { tp: 48, conc: { start: 2150, end: 2150 } } + - { tp: 40, conc: { start: 1075, end: 1075 } } + - { tp: 48, conc: { start: 538, end: 538 } } + - { tp: 48, conc: { start: 256, end: 256 } } + - { tp: 28, conc: { start: 102, end: 102 } } + - { tp: 28, conc: { start: 3, end: 48 } } + gb200-mtp: + 1k1k: + - { tp: 12, conc: { start: 2252, end: 2252 } } + - { tp: 24, conc: { start: 2150, end: 2150 } } + - { tp: 20, conc: { start: 1075, end: 1075 } } + - { tp: 
20, conc: { start: 512, end: 512 } } + - { tp: 36, conc: { start: 512, end: 512 } } + - { tp: 36, conc: { start: 144, end: 144 } } + - { tp: 36, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 32, conc: { start: 2150, end: 2150 } } + - { tp: 48, conc: { start: 1075, end: 1075 } } + - { tp: 64, conc: { start: 538, end: 538 } } + - { tp: 52, conc: { start: 269, end: 269 } } + - { tp: 52, conc: { start: 128, end: 128 } } + - { tp: 28, conc: { start: 54, end: 54 } } + - { tp: 28, conc: { start: 3, end: 24 } } + mi355x: + 1k1k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + fp8: + h200: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + h200-trt: + # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC + # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 64 + 1k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64 + 1k8k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 32 + 8k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } + b200: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + b200-trt: + # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC + # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 32 + 1k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } + # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64 + 1k8k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 64 + 8k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + gb200: 
+ 1k1k: + - { tp: 72, conc: { start: 8192, end: 8192 } } + - { tp: 72, conc: { start: 6144, end: 6144 } } + - { tp: 72, conc: { start: 5632, end: 5632 } } + - { tp: 72, conc: { start: 5376, end: 5376 } } + - { tp: 72, conc: { start: 5120, end: 5120 } } + - { tp: 72, conc: { start: 4992, end: 4992 } } + - { tp: 72, conc: { start: 4864, end: 4864 } } + - { tp: 72, conc: { start: 4608, end: 4608 } } + - { tp: 72, conc: { start: 1024, end: 4096 } } + 8k1k: + - { tp: 72, conc: { start: 128, end: 4096 } } + - { tp: 72, conc: { start: 576, end: 576 } } + - { tp: 72, conc: { start: 448, end: 448 } } + - { tp: 72, conc: { start: 384, end: 384 } } + mi300x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + mi325x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + mi355x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 3d1dd5051..cb9776b64 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -54,7 +54,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[2, 4, 8]' bmk-h200-fp8: if: ${{ inputs.use_h200 }} @@ -71,7 +70,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-h200-trt-fp8: if: ${{ inputs.use_h200 }} @@ -88,8 +86,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger 
concurrency till 128 bmk-b200-fp8: if: ${{ inputs.use_b200 }} @@ -106,7 +102,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has bmk-b200-trt-fp8: if: ${{ inputs.use_b200 }} @@ -123,8 +118,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 bmk-mi300x-fp8: if: ${{ inputs.use_mi300x }} @@ -141,7 +134,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-mi325x-fp8: if: ${{ inputs.use_mi325x }} @@ -158,7 +150,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-mi355x-fp8: if: ${{ inputs.use_mi355x }} @@ -175,7 +166,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-b200-fp4: if: ${{ inputs.use_b200 }} @@ -192,7 +182,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has bmk-b200-trt-fp4: if: ${{ inputs.use_b200 }} @@ -209,8 +198,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 bmk-mi355x-fp4: if: ${{ inputs.use_mi355x }} @@ -226,5 +213,4 @@ jobs: isl: ${{ 
inputs.isl }} osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' + random-range-ratio: ${{ inputs.random-range-ratio }} \ No newline at end of file diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 07f5b876d..b5bcc5817 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -84,6 +84,8 @@ jobs: exit 1 fi + # NOTE: https://github.com/InferenceMAX/InferenceMAX/pull/111 adds EP_SIZE and DP_ATTENTION parsing to the process_results.py script + # but it is not yet implemented for GB200 multi-node, therefore just default to: 1 "false" - name: Process results run: | # Process each result file @@ -93,7 +95,7 @@ jobs: # Extract GPU count from filename for tp_size calculation gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/") if [ -n "$gpus" ]; then - python3 utils/process_result.py ${{ inputs.runner }} $gpus ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE + python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 "false" ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE fi fi done diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 313087946..2eaaa7edd 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -32,17 +32,12 @@ on: random-range-ratio: required: true type: string - tp-list: - required: true - type: string - conc-list: - type: string - default: '[4, 8, 16, 32, 64]' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_CACHE: '/mnt/hf_hub_cache/' EXP_NAME: ${{ inputs.exp-name }} + RUNNER: ${{ inputs.runner }} MODEL: ${{ inputs.model }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} @@ -53,20 +48,38 @@ env: PRECISION: ${{ inputs.precision }} jobs: + flatten-search-space-matrix: + name: get search space ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ 
inputs.precision }} + timeout-minutes: 180 + runs-on: ubuntu-latest + outputs: + flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }} + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - id: flatten + run: python3 ${GITHUB_WORKSPACE}/utils/flatten_matrix.py + benchmark: runs-on: ${{ inputs.runner }} + needs: flatten-search-space-matrix timeout-minutes: 180 strategy: fail-fast: false matrix: - tp: ${{ fromJson(inputs.tp-list) }} - conc: ${{ fromJson(inputs.conc-list) }} - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.tp }} conc${{ matrix.conc }}' + config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }} + + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}' env: - TP: ${{ matrix.tp }} - CONC: ${{ matrix.conc }} + TP: ${{ matrix.config.tp }} + CONC: ${{ matrix.config.conc }} + EP_SIZE: ${{ matrix.config.ep || 1 }} + DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }} steps: - name: Resource cleanup @@ -139,7 +152,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} $TP $RESULT_FILENAME $FRAMEWORK $PRECISION + python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION - name: Upload result uses: actions/upload-artifact@v4 diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 3a48710f2..c8b7fd64d 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -40,22 +40,21 @@ on: default: false jobs: - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'lmsysorg/sglang:v0.5.2rc2-cu126' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 'fp8' - exp-name: ${{ 
inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' + # bmk-h200-fp8: + # if: ${{ inputs.use_h200 }} + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # runner: h200 + # image: 'lmsysorg/sglang:v0.5.2rc2-cu126' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # framework: 'sglang' + # precision: 'fp8' + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} bmk-h200-trt-fp8: if: ${{ inputs.use_h200 }} @@ -72,7 +71,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-fp8: if: ${{ inputs.use_b200 }} @@ -89,7 +87,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-trt-fp8: if: ${{ inputs.use_b200 }} @@ -106,7 +103,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi300x-fp8: if: ${{ inputs.use_mi300x }} @@ -123,7 +119,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi325x-fp8: if: ${{ inputs.use_mi325x }} @@ -140,7 +135,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi355x-fp8: if: ${{ inputs.use_mi355x }} @@ -157,7 +151,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-fp4: if: ${{ inputs.use_b200 }} @@ -174,8 +167,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: 
${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4,8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # Custom concurrency values for this job bmk-b200-trt-fp4: if: ${{ inputs.use_b200 }} @@ -192,8 +183,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128, 256]' # DPA4EP4 is already 30 tok/s/user and DPA8EP8 is already 35tok/s/user. 512 conc would be too much so we skipping it bmk-mi355x-fp4: if: ${{ inputs.use_mi355x }} @@ -210,8 +199,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - # These tensor parallelism settings are not necessary as they cannot fall on the Pareto frontier with this particular container - we remove them to save CI time. - tp-list: ${{ inputs.isl == 1024 && inputs.osl == 1024 && '[4, 8]' || '[8]' }} bmk-gb200-fp4-multinode-mtp-off: if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml index 0c505de07..03a205ff5 100644 --- a/.github/workflows/gptoss-tmpl.yml +++ b/.github/workflows/gptoss-tmpl.yml @@ -52,7 +52,6 @@ jobs: runner: h100 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -69,7 +68,6 @@ jobs: runner: h200 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -86,7 +84,6 @@ jobs: runner: b200 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -103,7 +100,6 @@ jobs: runner: b200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'trt' precision: 'fp4' @@ -120,7 +116,6 @@ jobs: runner: 
h200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'trt' precision: 'fp4' @@ -137,7 +132,6 @@ jobs: runner: mi300x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -154,7 +148,6 @@ jobs: runner: mi325x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -171,6 +164,5 @@ jobs: runner: mi355x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 4, 8]' framework: 'vllm' precision: 'fp4' diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index ffdae541c..d13584078 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -13,69 +13,50 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# Default +MOE_BACKEND="TRTLLM" -hf download $MODEL +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false +hf download $MODEL +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= if [[ "$TP" == "4" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && 
"$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi elif [[ "$TP" == "8" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 8 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 16 ]]; then - EP_SIZE="$TP" - fi if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi fi fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index e909b954a..6bc8c9fa7 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="DEEPGEMM" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', 
DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 20101e466..5dfdf8617 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -13,33 +13,16 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -hf download $MODEL - -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" MOE_BACKEND="CUTLASS" -DP_ATTENTION=false -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 96745306a..4f17d4d4f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -13,48 +13,31 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: 
$EP_SIZE, DP_ATTENTION=$DP_ATTENTION" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# Default MOE_BACKEND="TRTLLM" -DP_ATTENTION=false - -# Lower concurrencies: Concurrency < 256 -# MoE backend=TRTLLM -# Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k) -TEP_REQUIRED=false -if [[ "$TP" == "4" || "$TP" == "8" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - TEP_REQUIRED=true - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - TEP_REQUIRED=true - fi -fi -if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then - EP_SIZE="$TP" -fi # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true MOE_BACKEND="CUTLASS" fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +echo "MOE_BACKEND set to $MOE_BACKEND" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 diff --git a/utils/count_num_jobs.py b/utils/count_num_jobs.py new file mode 100644 index 000000000..776fd2a65 --- /dev/null +++ b/utils/count_num_jobs.py @@ -0,0 +1,37 @@ +import yaml +from collections import defaultdict + +with open('.github/configs/search-space.yml', 'r') as f: + data = yaml.safe_load(f) + +gpu_totals = defaultdict(int) +overall_total = 0 + +for model in data.values(): + for precision in model.values(): + for gpu, runner_data in precision.items(): + for seq_len in runner_data.values(): + for entry in seq_len: + # Count TP values + tp_list = entry['tp'] if isinstance(entry['tp'], list) else [entry['tp']] + tp_count = len(tp_list) + + # Count CONC values + conc = entry['conc'] + start, end = conc['start'], conc['end'] + step = conc.get('step', 
import yaml
import os
import json


def flatten_search_space(config_path, runner, model, isl, osl, precision):
    """Expand the benchmark search-space config into a flat list of job entries.

    Loads the YAML search space at *config_path*, selects the entry list for
    the (model, precision, runner, isl x osl) combination, and expands each
    entry's ``tp`` value(s) and ``conc`` geometric range into one dict per
    (tp, conc) pair.  Optional fields (``ep``, ``dp_attention``, ...) are
    carried through unchanged.

    Raises:
        AssertionError: if the combination is unknown or an entry is
            malformed.  Message texts are stable — the test suite matches
            on them with ``pytest.raises(match=...)``.
    """
    with open(config_path, 'r') as f:
        search_space = yaml.safe_load(f)

    seq_len_map = {
        '1024': '1k',
        '8192': '8k',
    }

    model_map = {
        'gpt-oss': 'gptoss',
        'llama-3.3-70b-instruct': 'llama',
        'deepseek-r1-0528': 'dsr1',
    }

    # Map the full model name to its short config key via case-insensitive
    # substring match (e.g. "openai/gpt-oss-120b" -> "gptoss").
    model_lower = model.lower()
    model_key = next(
        (short for name, short in model_map.items() if name in model_lower),
        None,
    )
    assert model_key, f"model '{model}' not recognized"

    assert seq_len_map.get(isl) and seq_len_map.get(osl), \
        "either isl or osl not recognized"
    seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}"

    # Hoist the nested lookups so each level is traversed once.
    model_cfg = search_space.get(model_key, {})
    assert model_cfg.get(precision), f"precision '{precision}' not recognized"
    runner_cfg = model_cfg[precision]
    assert runner_cfg.get(runner), f"runner '{runner}' not recognized"

    # Missing sequence-length key is not an error: it simply means no
    # configurations are defined for that combination.
    entries = runner_cfg[runner].get(seq_len, [])

    flattened_search_space = []
    for entry in entries:
        tp_list = _validated_tp_list(entry)
        conc_list = _expanded_conc_list(entry)
        for tp_value in tp_list:
            for conc_value in conc_list:
                # Shallow copy is sufficient: only the scalar 'tp' and the
                # 'conc' dict (replaced by an int) are overwritten.
                new_entry = entry.copy()
                new_entry['tp'] = tp_value
                new_entry['conc'] = conc_value
                flattened_search_space.append(new_entry)

    return flattened_search_space


def _validated_tp_list(entry):
    """Return entry['tp'] normalized to a list of ints, asserting it is well-formed."""
    assert entry.get('tp'), "entry malformed, expecting field 'tp'"
    tp = entry['tp']
    assert isinstance(tp, int) or (isinstance(tp, list) and all(isinstance(x, int) for x in tp)), \
        "entry malformed, expecting field 'tp' to be either an int or list of ints"
    return tp if isinstance(tp, list) else [tp]


def _expanded_conc_list(entry):
    """Expand entry['conc'] ({start, end, step?}) into a geometric progression.

    'step' defaults to 2, so {start: 4, end: 64} yields [4, 8, 16, 32, 64].
    """
    conc_config = entry.get('conc')
    assert conc_config, "entry malformed, missing field 'conc'"
    assert isinstance(conc_config, dict), "entry malformed, 'conc' must be a dict"
    assert 'start' in conc_config, "entry malformed, 'conc' missing required field 'start'"
    assert 'end' in conc_config, "entry malformed, 'conc' missing required field 'end'"
    assert isinstance(conc_config['start'], int), "entry malformed, 'conc.start' must be an int"
    assert isinstance(conc_config['end'], int), "entry malformed, 'conc.end' must be an int"
    assert conc_config['start'] <= conc_config['end'], \
        "entry malformed, 'conc.start' must be <= 'conc.end'"

    start = conc_config['start']
    end = conc_config['end']
    step_factor = conc_config.get('step', 2)

    # Only validate 'step' when explicitly given; the default (2) is known good.
    if 'step' in conc_config:
        assert isinstance(step_factor, int), "entry malformed, 'conc.step' must be an int"
        assert step_factor > 1, "entry malformed, 'conc.step' must be > 1"

    conc_list = []
    current = start
    while current <= end:
        conc_list.append(current)
        current *= step_factor
    return conc_list


def main():
    """Flatten the search space selected by env vars and emit it as a GitHub Actions output.

    Reads RUNNER/MODEL/ISL/OSL/PRECISION from the environment and appends a
    ``flattened-matrix=<json>`` line to the file named by GITHUB_OUTPUT.
    """
    config_path = '.github/configs/search-space.yml'
    runner = os.environ['RUNNER']
    model = os.environ['MODEL']
    isl = os.environ['ISL']
    osl = os.environ['OSL']
    precision = os.environ['PRECISION']

    flattened_search_space = flatten_search_space(
        config_path, runner, model, isl, osl, precision
    )

    with open(os.environ['GITHUB_OUTPUT'], 'a') as f:
        f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n")


if __name__ == '__main__':
    main()
int(sys.argv[2]) -result_filename = sys.argv[3] -framework = sys.argv[4] -precision = sys.argv[5] +ep_size = int(sys.argv[3]) +dp_attention = sys.argv[4] +result_filename = sys.argv[5] +framework = sys.argv[6] +precision = sys.argv[7] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) @@ -15,7 +17,9 @@ data = { 'hw': hw, 'tp': tp_size, + 'ep': ep_size, 'conc': int(bmk_result['max_concurrency']), + 'dp_attention': dp_attention, # true or false 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, @@ -23,8 +27,8 @@ 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size } -if len(sys.argv) == 7: # MTP - data['mtp'] = sys.argv[6] +if len(sys.argv) == 9: # MTP + data['mtp'] = sys.argv[8] for key, value in bmk_result.items(): if key.endswith('ms'): diff --git a/utils/summarize.py b/utils/summarize.py index 1f78caf9c..de8863c78 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -9,11 +9,11 @@ with open(result_path) as f: result = json.load(f) results.append(result) -results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['conc'])) +results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) summary_header = f'''\ -| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) @@ -25,7 +25,9 @@ f"| {framework.upper()} " f"| {precision.upper()} " f"| {result['tp']} " + f"| {result['ep']} " f"| {result['conc']} " + f"| {result['dp_attention']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " f"| {result['median_e2el']:.4f} " diff --git 
a/utils/test_flatten_matrix.py b/utils/test_flatten_matrix.py new file mode 100644 index 000000000..b4f0620ad --- /dev/null +++ b/utils/test_flatten_matrix.py @@ -0,0 +1,469 @@ +import pytest +import yaml +from flatten_matrix import flatten_search_space + + +@pytest.fixture +def minimal_config(): + """Minimal valid config for testing""" + return { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': [2, 4], 'conc': {'start': 4, 'end': 8}} + ] + } + } + }, + 'llama': { + 'fp8': { + 'b200': { + '1k8k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 64, 'step': 4}} + ] + } + } + }, + 'dsr1': { + 'fp4': { + 'b200-trt': { + '8k1k': [ + {'tp': 4, 'conc': {'start': 4, 'end': 32}}, + {'tp': 4, 'ep': 4, 'dp_attention': 'true', 'conc': {'start': 64, 'end': 256}} + ] + } + } + } + } + + +@pytest.fixture +def config_file(minimal_config, tmp_path): + # temp config file + config_path = tmp_path / "search-space.yml" + with open(config_path, 'w') as f: + yaml.dump(minimal_config, f) + return config_path + + +class TestValidCases: + """Test valid input scenarios""" + + def test_single_tp_value(self, config_file): + """Test with single TP value""" + result = flatten_search_space( + config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8' + ) + + # Should generate: tp=2, conc=[4, 16, 64] with step=4 + assert len(result) == 3 + assert all(entry['tp'] == 2 for entry in result) + assert [entry['conc'] for entry in result] == [4, 16, 64] + + def test_list_of_tp_values(self, config_file): + """Test with list of TP values""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + # Should generate: tp=[2,4], conc=[4,8] = 2*2 = 4 combinations + assert len(result) == 4 + tp_values = [entry['tp'] for entry in result] + assert tp_values.count(2) == 2 + assert tp_values.count(4) == 2 + + def test_optional_fields_preserved(self, config_file): + """Test that optional fields like ep and dp_attention are preserved""" + result = 
flatten_search_space( + config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4' + ) + + # Second entry should have ep and dp_attention + entries_with_ep = [e for e in result if 'ep' in e] + assert len(entries_with_ep) > 0 + assert all(e['ep'] == 4 for e in entries_with_ep) + + entries_with_dp = [e for e in result if 'dp_attention' in e] + assert len(entries_with_dp) > 0 + assert all(e['dp_attention'] == 'true' for e in entries_with_dp) + + def test_default_step_factor(self, config_file): + """Test that default step factor of 2 is used when not specified""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + # conc: start=4, end=8, default step=2 -> [4, 8] + conc_values = sorted(set(entry['conc'] for entry in result)) + assert conc_values == [4, 8] + + def test_custom_step_factor(self, config_file): + """Test custom step factor""" + result = flatten_search_space( + config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8' + ) + + # conc: start=4, end=64, step=4 -> [4, 16, 64] + conc_values = sorted(set(entry['conc'] for entry in result)) + assert conc_values == [4, 16, 64] + + +class TestModelMapping: + """Test model name mapping""" + + def test_gptoss_mapping(self, config_file): + """Test gpt-oss maps to gptoss""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + assert len(result) > 0 + + def test_llama_mapping(self, config_file): + """Test llama mapping with case insensitivity""" + result = flatten_search_space( + config_file, 'b200', 'LLAMA-3.3-70B-INSTRUCT', '1024', '8192', 'fp8' + ) + assert len(result) > 0 + + def test_dsr1_mapping(self, config_file): + """Test deepseek-r1 maps to dsr1""" + result = flatten_search_space( + config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4' + ) + assert len(result) > 0 + + +class TestInvalidInputs: + """Test error handling for invalid inputs""" + + def test_unrecognized_model(self, config_file): + 
"""Test error for unrecognized model""" + with pytest.raises(AssertionError, match="model .* not recognized"): + flatten_search_space( + config_file, 'h100', 'unknown-model', '1024', '1024', 'fp4' + ) + + def test_invalid_isl(self, config_file): + """Test error for invalid ISL""" + with pytest.raises(AssertionError, match="either isl or osl not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '2048', '1024', 'fp4' + ) + + def test_invalid_osl(self, config_file): + """Test error for invalid OSL""" + with pytest.raises(AssertionError, match="either isl or osl not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '4096', 'fp4' + ) + + def test_invalid_precision(self, config_file): + """Test error for invalid precision""" + with pytest.raises(AssertionError, match="precision .* not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp16' + ) + + def test_invalid_runner(self, config_file): + """Test error for invalid runner""" + with pytest.raises(AssertionError, match="runner .* not recognized"): + flatten_search_space( + config_file, 'a100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + +class TestMalformedEntries: + """Test validation of malformed config entries""" + + def test_missing_tp_field(self, tmp_path): + """Test error when tp field is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="entry malformed, expecting field 'tp'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_tp_wrong_type(self, tmp_path): + """Test error when tp is wrong type""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 'invalid', 'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" 
+ with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_tp_list_with_non_ints(self, tmp_path): + """Test error when tp list contains non-integers""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': [2, 'four', 8], 'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_missing_conc_field(self, tmp_path): + """Test error when conc field is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="entry malformed, missing field 'conc'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_not_dict(self, tmp_path): + """Test error when conc is not a dict""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': [4, 8, 16]} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc' must be a dict"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_missing_start(self, tmp_path): + """Test error when conc.start is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with 
pytest.raises(AssertionError, match="'conc' missing required field 'start'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_missing_end(self, tmp_path): + """Test error when conc.end is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc' missing required field 'end'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_start_not_int(self, tmp_path): + """Test error when conc.start is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': '4', 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.start' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_end_not_int(self, tmp_path): + """Test error when conc.end is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': '8'}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.end' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_start_greater_than_end(self, tmp_path): + """Test error when conc.start > conc.end""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 16, 'end': 4}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.start' must be <= 'conc.end'"): + 
flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_step_not_int(self, tmp_path): + """Test error when step is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': '2'}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.step' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_step_not_greater_than_one(self, tmp_path): + """Test error when step <= 1""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': 1}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.step' must be > 1"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + +class TestConcurrencyGeneration: + """Test concurrency value generation logic""" + + def test_geometric_progression(self, tmp_path): + """Test that concurrency values follow geometric progression""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 1, 'conc': {'start': 4, 'end': 64, 'step': 2}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + result = flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + conc_values = [entry['conc'] for entry in result] + assert conc_values == [4, 8, 16, 32, 64] + + def test_single_conc_value(self, tmp_path): + """Test when start equals end""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 1, 'conc': {'start': 64, 'end': 64}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + result = 
flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + assert len(result) == 1 + assert result[0]['conc'] == 64 + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) \ No newline at end of file