From 35bc387fffe898ea47d190c2f3ebd8c06d568d96 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 16 Oct 2025 19:31:22 -0500 Subject: [PATCH 01/17] adding static config defining parallelism search spaces for all runs --- .github/configs/search-space.yml | 382 ++++++++++++++++++++++++ .github/workflows/70b-tmpl.yml | 16 +- .github/workflows/benchmark-tmpl.yml | 36 ++- .github/workflows/dsr1-tmpl.yml | 13 - .github/workflows/gptoss-tmpl.yml | 8 - benchmarks/dsr1_fp4_b200_trt_slurm.sh | 122 ++++---- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 46 +-- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 46 +-- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 50 ++-- utils/flatten_matrix.py | 73 +++++ 10 files changed, 629 insertions(+), 163 deletions(-) create mode 100644 .github/configs/search-space.yml create mode 100644 utils/flatten_matrix.py diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml new file mode 100644 index 000000000..7c1564909 --- /dev/null +++ b/.github/configs/search-space.yml @@ -0,0 +1,382 @@ +gptoss: + fp4: + h100: + 1k1k: + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: [ 8 ], conc: { start: 4, end: 32 } } + h200: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 1, conc: { start: 4, end: 16 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + h200-trt: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { 
start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + b200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } + b200-trt: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 8 } } + - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 4, end: 64 } } + - { tp: 4, conc: { start: 4, end: 8 } } + - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + 8k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 8 } } + mi300x: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 8k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi325x: + 1k1k: + - { tp: [ 1, 2, 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 4 ], conc: { start: 64, end: 64 } } + - { tp: [ 2, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 8 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi355x: + 1k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 16 } } + 1k8k: + - { 
tp: [ 1, 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 4, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 16 } } + +llama: + fp4: + b200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 16, end: 64 } } + - { tp: 2, conc: { start: 16, end: 64 } } + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 8, conc: { start: 4, end: 8 } } + b200-trt: + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 128 } } + - { tp: 2, conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + mi355x: + 1k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 1k8k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: [ 2, 4 ], conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 16 } } + fp8: + h100: + 1k1k: + - { tp: 2, conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 2, conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 2, conc: { start: 32, end: 64 } 
} + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + h200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 16, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + h200-trt: + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + b200: + 1k1k: + - { tp: 1, conc: { start: 64, end: 64 } } + - { tp: 2, conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: 4, conc: { start: 16, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 64 } } + - { tp: 2, conc: { start: 16, end: 64 } } + - { tp: 4, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 32 } } + b200-trt: + 1k1k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 1k8k: + - { tp: 1, conc: { start: 128, end: 128 } } + - { tp: 2, conc: { start: 64, end: 128 } } + - { tp: 4, conc: { start: 16, end: 128 } } + - { tp: 8, conc: { start: 4, end: 32 } } + 8k1k: + - { tp: 1, conc: { start: 32, end: 128 } } + - { tp: 2, conc: { start: 16, end: 128 } } + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 
16 } } + mi300x: + 1k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 64, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + mi325x: + 1k1k: + - { tp: [ 1, 2, 4 ], conc: { start: 32, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: 4, conc: { start: 64, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 1, conc: { start: 16, end: 64 } } + - { tp: 2, conc: { start: 4, end: 32 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + mi355x: + 1k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 8k1k: + - { tp: [ 1, 2 ], conc: { start: 32, end: 64 } } + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + +dsr1: + fp4: + b200: + 1k1k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } } + 1k8k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 128 } } + 8k1k: + - { tp: 4, conc: { start: 4, end: 128 } } + - { tp: 8, conc: { start: 4, end: 16 } } + b200-trt: + 1k1k: + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } + - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + - { tp: 8, conc: { start: 4, end: 8 } } + - { tp: 8, ep: 8, conc: { start: 16, end: 128 } } + - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + 1k8k: + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } + - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + - { tp: 8, conc: { start: 4, end: 16 } } + - { tp: 8, ep: 8, 
conc: { start: 32, end: 128 } } + - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + 8k1k: + - { tp: 4, conc: { start: 4, end: 32 } } + - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } } + - { tp: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } } + gb200: + 1k1k: + - { tp: 12, conc: { start: 4300, end: 4300 } } + - { tp: 24, conc: { start: 4300, end: 4300 } } + - { tp: 24, conc: { start: 2048, end: 2048 } } + - { tp: 20, conc: { start: 1075, end: 1075 } } + - { tp: 36, conc: { start: 1075, end: 1075 } } + - { tp: 36, conc: { start: 564, end: 564 } } + - { tp: 36, conc: { start: 4, end: 256 } } + 8k1k: + - { tp: 28, conc: { start: 2150, end: 2150 } } + - { tp: 48, conc: { start: 2150, end: 2150 } } + - { tp: 40, conc: { start: 1075, end: 1075 } } + - { tp: 48, conc: { start: 538, end: 538 } } + - { tp: 48, conc: { start: 256, end: 256 } } + - { tp: 28, conc: { start: 102, end: 102 } } + - { tp: 28, conc: { start: 3, end: 48 } } + gb200-mtp: + 1k1k: + - { tp: 12, conc: { start: 2252, end: 2252 } } + - { tp: 24, conc: { start: 2150, end: 2150 } } + - { tp: 20, conc: { start: 1075, end: 1075 } } + - { tp: 20, conc: { start: 512, end: 512 } } + - { tp: 36, conc: { start: 512, end: 512 } } + - { tp: 36, conc: { start: 144, end: 144 } } + - { tp: 36, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 32, conc: { start: 2150, end: 2150 } } + - { tp: 48, conc: { start: 1075, end: 1075 } } + - { tp: 64, conc: { start: 538, end: 538 } } + - { tp: 52, conc: { start: 269, end: 269 } } + - { tp: 52, conc: { start: 128, end: 128 } } + - { tp: 28, conc: { start: 54, end: 54 } } + - { tp: 28, conc: { start: 3, end: 24 } } + mi355x: + 1k1k: + - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + fp8: + h200: + 1k1k: + 
- { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + h200-trt: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + b200: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + b200-trt: + 1k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + 1k8k: + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } + gb200: + 1k1k: + - { tp: 72, conc: { start: 8192, end: 8192 } } + - { tp: 72, conc: { start: 6144, end: 6144 } } + - { tp: 72, conc: { start: 5632, end: 5632 } } + - { tp: 72, conc: { start: 5376, end: 5376 } } + - { tp: 72, conc: { start: 5120, end: 5120 } } + - { tp: 72, conc: { start: 4992, end: 4992 } } + - { tp: 72, conc: { start: 4864, end: 4864 } } + - { tp: 72, conc: { start: 4608, end: 4608 } } + - { tp: 72, conc: { start: 1024, end: 4096 } } + 8k1k: + - { tp: 72, conc: { start: 128, end: 4096 } } + - { tp: 72, conc: { start: 576, end: 576 } } + - { tp: 72, conc: { start: 448, end: 448 } } + - { tp: 72, conc: { start: 384, end: 384 } } + mi300x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + mi325x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + mi355x: + 1k1k: + - { tp: 8, conc: { start: 4, end: 64 } } + 1k8k: + - { tp: 8, conc: { start: 4, end: 64 } } + 8k1k: + - { tp: 8, conc: { start: 4, end: 64 } } diff --git a/.github/workflows/70b-tmpl.yml b/.github/workflows/70b-tmpl.yml index 
3d1dd5051..cb9776b64 100644 --- a/.github/workflows/70b-tmpl.yml +++ b/.github/workflows/70b-tmpl.yml @@ -54,7 +54,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[2, 4, 8]' bmk-h200-fp8: if: ${{ inputs.use_h200 }} @@ -71,7 +70,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-h200-trt-fp8: if: ${{ inputs.use_h200 }} @@ -88,8 +86,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # H200 can achieve TPS/User >= 30 with larger concurrency till 128 bmk-b200-fp8: if: ${{ inputs.use_b200 }} @@ -106,7 +102,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has bmk-b200-trt-fp8: if: ${{ inputs.use_b200 }} @@ -123,8 +118,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 256 bmk-mi300x-fp8: if: ${{ inputs.use_mi300x }} @@ -141,7 +134,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-mi325x-fp8: if: ${{ inputs.use_mi325x }} @@ -158,7 +150,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-mi355x-fp8: if: ${{ inputs.use_mi355x }} @@ -175,7 +166,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} 
random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' bmk-b200-fp4: if: ${{ inputs.use_b200 }} @@ -192,7 +182,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has bmk-b200-trt-fp4: if: ${{ inputs.use_b200 }} @@ -209,8 +198,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' # fix: add TP=2,4 to B200, just as mi355 has - conc-list: '[4, 8, 16, 32, 64, 128]' # B200 can achieve TPS/User >= 30 with larger concurrency till 128 bmk-mi355x-fp4: if: ${{ inputs.use_mi355x }} @@ -226,5 +213,4 @@ jobs: isl: ${{ inputs.isl }} osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[1, 2, 4, 8]' + random-range-ratio: ${{ inputs.random-range-ratio }} \ No newline at end of file diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 313087946..03d007f5c 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -32,17 +32,12 @@ on: random-range-ratio: required: true type: string - tp-list: - required: true - type: string - conc-list: - type: string - default: '[4, 8, 16, 32, 64]' env: HF_TOKEN: ${{ secrets.HF_TOKEN }} HF_HUB_CACHE: '/mnt/hf_hub_cache/' EXP_NAME: ${{ inputs.exp-name }} + RUNNER: ${{ inputs.runner }} MODEL: ${{ inputs.model }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} @@ -53,22 +48,41 @@ env: PRECISION: ${{ inputs.precision }} jobs: + flatten-search-space-matrix: + runs-on: ubuntu-latest + outputs: + flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }} + steps: + - uses: actions/checkout@v3 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - id: flatten + run: python3 ${GITHUB_WORKSPACE}/utils/flatten_matrix.py + 
benchmark: runs-on: ${{ inputs.runner }} + needs: flatten-search-space-matrix timeout-minutes: 180 strategy: fail-fast: false matrix: - tp: ${{ fromJson(inputs.tp-list) }} - conc: ${{ fromJson(inputs.conc-list) }} - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.tp }} conc${{ matrix.conc }}' + config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }} + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} conc${{ matrix.config.conc }}' env: - TP: ${{ matrix.tp }} - CONC: ${{ matrix.conc }} + TP: ${{ matrix.config.tp }} + CONC: ${{ matrix.config.conc }} + EP_SIZE: ${{ matrix.config.ep || 1 }} + DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }} + MOE_BACKEND: ${{ matrix.config.moe_backend || 'NONE' }} steps: + - name: debug + run: echo "${{ matrix.config }}" + - name: Resource cleanup run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 3a48710f2..64dfe4ec4 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -55,7 +55,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-h200-trt-fp8: if: ${{ inputs.use_h200 }} @@ -72,7 +71,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-fp8: if: ${{ inputs.use_b200 }} @@ -89,7 +87,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-trt-fp8: if: ${{ inputs.use_b200 }} @@ -106,7 +103,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi300x-fp8: if: ${{ 
inputs.use_mi300x }} @@ -123,7 +119,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi325x-fp8: if: ${{ inputs.use_mi325x }} @@ -140,7 +135,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-mi355x-fp8: if: ${{ inputs.use_mi355x }} @@ -157,7 +151,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[8]' bmk-b200-fp4: if: ${{ inputs.use_b200 }} @@ -174,8 +167,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4,8]' - conc-list: '[4, 8, 16, 32, 64, 128]' # Custom concurrency values for this job bmk-b200-trt-fp4: if: ${{ inputs.use_b200 }} @@ -192,8 +183,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - tp-list: '[4, 8]' - conc-list: '[4, 8, 16, 32, 64, 128, 256]' # DPA4EP4 is already 30 tok/s/user and DPA8EP8 is already 35tok/s/user. 512 conc would be too much so we skipping it bmk-mi355x-fp4: if: ${{ inputs.use_mi355x }} @@ -210,8 +199,6 @@ jobs: osl: ${{ inputs.osl }} max-model-len: ${{ inputs.max-model-len }} random-range-ratio: ${{ inputs.random-range-ratio }} - # These tensor parallelism settings are not necessary as they cannot fall on the Pareto frontier with this particular container - we remove them to save CI time. 
- tp-list: ${{ inputs.isl == 1024 && inputs.osl == 1024 && '[4, 8]' || '[8]' }} bmk-gb200-fp4-multinode-mtp-off: if: ${{ inputs.use_gb200 && !(inputs.isl == '1024' && inputs.osl == '8192') }} diff --git a/.github/workflows/gptoss-tmpl.yml b/.github/workflows/gptoss-tmpl.yml index 0c505de07..03a205ff5 100644 --- a/.github/workflows/gptoss-tmpl.yml +++ b/.github/workflows/gptoss-tmpl.yml @@ -52,7 +52,6 @@ jobs: runner: h100 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -69,7 +68,6 @@ jobs: runner: h200 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -86,7 +84,6 @@ jobs: runner: b200 image: 'vllm/vllm-openai:v0.10.2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -103,7 +100,6 @@ jobs: runner: b200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'trt' precision: 'fp4' @@ -120,7 +116,6 @@ jobs: runner: h200-trt image: 'nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'trt' precision: 'fp4' @@ -137,7 +132,6 @@ jobs: runner: mi300x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -154,7 +148,6 @@ jobs: runner: mi325x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 2, 4, 8]' framework: 'vllm' precision: 'fp4' @@ -171,6 +164,5 @@ jobs: runner: mi355x image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1' model: 'openai/gpt-oss-120b' - tp-list: '[1, 4, 8]' framework: 'vllm' precision: 'fp4' diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index ffdae541c..dbffe8494 100644 --- 
a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -13,69 +13,77 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION +# MOE_BACKEND echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# MOE_BACKEND arrives as the literal NONE when no backend is configured; default to TRTLLM +if [[ "$MOE_BACKEND" == "NONE" ]]; then + MOE_BACKEND="TRTLLM" +fi -hf download $MODEL +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false - -if [[ "$TP" == "4" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - fi - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - fi - elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - fi - fi -elif [[ "$TP" == "8" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 8 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - fi - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 16 ]]; then - EP_SIZE="$TP" - fi - if [[ $CONC -ge 256 ]]; then - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - fi - elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" - fi - fi -fi +hf download $MODEL -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on
ISL, OSL, CONC ========= +# EP_SIZE="1" +# MOE_BACKEND="TRTLLM" +# DP_ATTENTION=false + +# if [[ "$TP" == "4" ]]; then +# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 32 ]]; then +# EP_SIZE="$TP" +# fi +# if [[ $CONC -ge 256 ]]; then +# DP_ATTENTION=true +# MOE_BACKEND="CUTLASS" +# fi +# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then +# if [[ $CONC -gt 32 ]]; then +# EP_SIZE="$TP" +# fi +# if [[ $CONC -ge 256 ]]; then +# DP_ATTENTION=true +# MOE_BACKEND="CUTLASS" +# fi +# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 32 ]]; then +# EP_SIZE="$TP" +# DP_ATTENTION=true +# MOE_BACKEND="CUTLASS" +# fi +# fi +# elif [[ "$TP" == "8" ]]; then +# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 8 ]]; then +# EP_SIZE="$TP" +# fi +# if [[ $CONC -ge 256 ]]; then +# DP_ATTENTION=true +# MOE_BACKEND="CUTLASS" +# fi +# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then +# if [[ $CONC -gt 16 ]]; then +# EP_SIZE="$TP" +# fi +# if [[ $CONC -ge 256 ]]; then +# DP_ATTENTION=true +# MOE_BACKEND="CUTLASS" +# fi +# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 32 ]]; then +# EP_SIZE="$TP" +# DP_ATTENTION=true +# MOE_BACKEND="CUTLASS" +# fi +# fi +# fi + +# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index e909b954a..67dbb8396 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -13,31 +13,39 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION +# MOE_BACKEND echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# MOE_BACKEND arrives as the literal NONE when no backend is configured; default to DEEPGEMM +if [[ "$MOE_BACKEND" == "NONE" ]]; then + MOE_BACKEND="DEEPGEMM" +fi -hf download $MODEL +echo "TP: $TP,
CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="$TP" -MOE_BACKEND="DEEPGEMM" -DP_ATTENTION=false +hf download $MODEL -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -fi +# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= +# EP_SIZE="$TP" +# MOE_BACKEND="DEEPGEMM" +# DP_ATTENTION=false + +# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 32 ]]; then +# DP_ATTENTION=true +# fi +# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then +# if [[ $CONC -gt 64 ]]; then +# DP_ATTENTION=true +# fi +# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 64 ]]; then +# DP_ATTENTION=true +# fi +# fi echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 20101e466..a71068867 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -13,31 +13,39 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION +# MOE_BACKEND echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# MOE_BACKEND arrives as the literal NONE when no backend is configured; default to CUTLASS +if [[ "$MOE_BACKEND" == "NONE" ]]; then + MOE_BACKEND="CUTLASS" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" hf download $MODEL # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC =========
-EP_SIZE="$TP" -MOE_BACKEND="CUTLASS" -DP_ATTENTION=false - -if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - if [[ $CONC -gt 64 ]]; then - DP_ATTENTION=true - fi -elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ $CONC -gt 32 ]]; then - DP_ATTENTION=true - fi -fi +# EP_SIZE="$TP" +# MOE_BACKEND="CUTLASS" +# DP_ATTENTION=false + +# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 64 ]]; then +# DP_ATTENTION=true +# fi +# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then +# if [[ $CONC -gt 64 ]]; then +# DP_ATTENTION=true +# fi +# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then +# if [[ $CONC -gt 32 ]]; then +# DP_ATTENTION=true +# fi +# fi echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" @@ -49,7 +57,7 @@ cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: enable_padding: true max_batch_size: 128 -enable_attention_dp: $DP_ATTENTION + enable_attention_dp: $DP_ATTENTION print_iter_log: true kv_cache_config: dtype: fp8 diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 96745306a..9ffa26659 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -13,48 +13,56 @@ # CONC # RESULT_FILENAME # PORT_OFFSET +# EP_SIZE +# DP_ATTENTION +# MOE_BACKEND # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" +# MOE_BACKEND arrives as the literal NONE when no backend is configured; default to TRTLLM +if [[ "$MOE_BACKEND" == "NONE" ]]; then + MOE_BACKEND="TRTLLM" +fi + +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" hf download $MODEL SERVER_LOG=$(mktemp
/tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -EP_SIZE="1" -MOE_BACKEND="TRTLLM" -DP_ATTENTION=false +# EP_SIZE="1" +# MOE_BACKEND="TRTLLM" +# DP_ATTENTION=false # Lower concurrencies: Concurrency < 256 # MoE backend=TRTLLM # Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k) -TEP_REQUIRED=false -if [[ "$TP" == "4" || "$TP" == "8" ]]; then - if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then - TEP_REQUIRED=true - elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then - TEP_REQUIRED=true - fi -fi -if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then - EP_SIZE="$TP" -fi +# TEP_REQUIRED=false +# if [[ "$TP" == "4" || "$TP" == "8" ]]; then +# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then +# TEP_REQUIRED=true +# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then +# TEP_REQUIRED=true +# fi +# fi +# if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then +# EP_SIZE="$TP" +# fi # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS # Use DP attention with expert parallel MoE -if [[ $CONC -ge 256 ]]; then - EP_SIZE="$TP" - DP_ATTENTION=true - MOE_BACKEND="CUTLASS" -fi +# if [[ $CONC -ge 256 ]]; then +# EP_SIZE="$TP" +# DP_ATTENTION=true +# MOE_BACKEND="CUTLASS" +# fi -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 diff --git a/utils/flatten_matrix.py b/utils/flatten_matrix.py new file mode 100644 index 000000000..a46bce549 --- /dev/null +++ b/utils/flatten_matrix.py @@ -0,0 +1,73 @@ +import yaml +import os +import json + +with open('.github/configs/search-space.yml', 'r') as f: + search_space = yaml.safe_load(f) + +seq_len_map = { + '1024': '1k', + '8192': '8k', +} + +runner =
os.environ['RUNNER'] +model = os.environ['MODEL'] +isl = os.environ['ISL'] +osl = os.environ['OSL'] +precision = os.environ['PRECISION'] + +# In the workflows, not all model references are the same so do a sort +# of partial matching to map to the search space keys +model_map = { + 'gpt-oss': 'gptoss', + 'llama-3.3-70b-instruct': 'llama', + 'deepseek-r1-0528': 'dsr1', +} + +model_key = None +model_lower = model.lower() +for key, value in model_map.items(): + if key.lower() in model_lower: + model_key = value + break + +if model_key is None: + raise ValueError(f"Model '{model}' is not recognized.") + +seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}" + +if isinstance(search_space, list): + entries = search_space +else: + entries = search_space.get(model_key, {}).get( + precision, {}).get(runner, {}).get(seq_len, []) + +flattened_search_space = [] +for entry in entries: + tp_list = entry['tp'] if isinstance( + entry.get('tp'), list) else [entry.get('tp')] + + conc_config = entry.get('conc') + if isinstance(conc_config, dict): + start, end = conc_config['start'], conc_config['end'] + step_factor = conc_config.get('step', 2) + if start <= 0 or step_factor <= 1: # the geometric sweep below would never reach 'end' + raise ValueError(f"Invalid conc range (need start > 0 and step > 1): {conc_config}") + conc_list, current = [], start + while current <= end: + conc_list.append(current) + current *= step_factor + elif isinstance(conc_config, list): + conc_list = conc_config + else: + conc_list = [conc_config] + + for tp_value in tp_list: + for conc_value in conc_list: + new_entry = entry.copy() + new_entry['tp'] = tp_value + new_entry['conc'] = conc_value + flattened_search_space.append(new_entry) + +with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n") From 683aba7eac106fb18308d7fbbf88abeda3673b9c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 16 Oct 2025 23:04:12 -0500 Subject: [PATCH 02/17] getting rid of moe backend logic; adding comments --- .github/configs/search-space.yml | 60 ++++++++++++---- .github/workflows/benchmark-tmpl.yml | 1 -
benchmarks/dsr1_fp4_b200_trt_slurm.sh | 92 ++++++++++--------------- benchmarks/dsr1_fp8_b200_trt_slurm.sh | 27 +------- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 29 +------- benchmarks/gptoss_fp4_b200_trt_slurm.sh | 43 +++--------- 6 files changed, 95 insertions(+), 157 deletions(-) diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml index 7c1564909..34ed9f268 100644 --- a/.github/configs/search-space.yml +++ b/.github/configs/search-space.yml @@ -46,17 +46,25 @@ gptoss: - { tp: 1, conc: { start: 4, end: 64 } } - { tp: [ 2, 4, 8 ], conc: { start: 4, end: 64 } } b200-trt: + # NOTE: Regardless of TP, if CONC >= 256, DP_ATTENTION should be set + # to true and EP should be set to TP, i.e., + # For lower concurrencies (CONC < 256), use TP Attention; Switch to + # MoE Expert parallel for conurrency >=16 (1k1k and 1k8k) 1k1k: - { tp: 1, conc: { start: 64, end: 64 } } - { tp: 2, conc: { start: 4, end: 64 } } + # EP=4 iff TP=4 and CONC >= 16 - { tp: 4, conc: { start: 4, end: 8 } } - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + # EP=8 iff TP=8 and CONC >= 16 - { tp: 8, conc: { start: 4, end: 8 } } 1k8k: - { tp: 1, conc: { start: 64, end: 64 } } - { tp: 2, conc: { start: 4, end: 64 } } + # EP=4 iff TP=4 and CONC >= 16 - { tp: 4, conc: { start: 4, end: 8 } } - { tp: 4, ep: 4, conc: { start: 16, end: 64 } } + # EP=8 iff TP=8 and CONC >= 16 - { tp: 8, conc: { start: 4, end: 8 } } 8k1k: - { tp: 1, conc: { start: 64, end: 64 } } @@ -252,25 +260,45 @@ dsr1: - { tp: 4, conc: { start: 4, end: 128 } } - { tp: 8, conc: { start: 4, end: 16 } } b200-trt: + # Determine DP_ATTENTION, and EP_SIZE based on ISL, OSL, CONC + # For ISL/OSL = 1k/1k 1k1k: + # If TP=4, + # EP_SIZE=4 iff CONC > 32 + # DP_ATTENTION=true iff CONC >= 256 - { tp: 4, conc: { start: 4, end: 32 } } - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } - - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + - { tp: 4, ep: 4, dp_attention: "true", 
conc: { start: 256, end: 256 } } + # If TP=8, + # EP_SIZE=8 iff CONC > 8 + # DP_ATTENTION=true iff CONC >= 256 - { tp: 8, conc: { start: 4, end: 8 } } - { tp: 8, ep: 8, conc: { start: 16, end: 128 } } - - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } } + # For ISL/OSL = 1k/8k 1k8k: + # If TP=4, + # EP_SIZE=4 iff CONC > 32 + # DP_ATTENTION=true iff CONC >= 256 - { tp: 4, conc: { start: 4, end: 32 } } - { tp: 4, ep: 4, conc: { start: 64, end: 128 } } - - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } + # If TP=8, + # EP_SIZE=8 iff CONC > 16 + # DP_ATTENTION=true iff CONC >= 256 - { tp: 8, conc: { start: 4, end: 16 } } - { tp: 8, ep: 8, conc: { start: 32, end: 128 } } - - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 256, end: 256 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 256, end: 256 } } + # For ISL/OSL = 8k/1k 8k1k: + # If TP=4, + # EP_SIZE=4 and DP_ATTENTION=true iff CONC > 32 - { tp: 4, conc: { start: 4, end: 32 } } - - { tp: 4, ep: 4, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } } + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 64, end: 256 } } + # If TP=8, + # EP_SIZE=8 and DP_ATTENTION=true iff CONC > 32 - { tp: 8, conc: { start: 4, end: 32 } } - - { tp: 8, ep: 8, dp_attention: "true", moe_backend: "CUTLASS", conc: { start: 64, end: 256 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 256 } } gb200: 1k1k: - { tp: 12, conc: { start: 4300, end: 4300 } } @@ -321,13 +349,17 @@ dsr1: 8k1k: - { tp: 8, conc: { start: 4, end: 64 } } h200-trt: + # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC + # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 64 1k1k: - - { tp: 8, conc: { start: 4, end: 64 } } - - { tp: 
8, conc: { start: 4, end: 64 } } + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64 1k8k: - - { tp: 8, conc: { start: 4, end: 64 } } + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 32 8k1k: - - { tp: 8, conc: { start: 4, end: 64 } } + - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } b200: 1k1k: - { tp: 8, conc: { start: 4, end: 64 } } @@ -336,13 +368,17 @@ dsr1: 8k1k: - { tp: 8, conc: { start: 4, end: 64 } } b200-trt: + # Determine DP_ATTENTION and EP_SIZE based on ISL, OSL, CONC + # If ISL/OSL = 1k/1k, DP_ATTENTION=true iff CONC > 32 1k1k: - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } + - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } + # If ISL/OSL = 1k/8k, DP_ATTENTION=true iff CONC > 64 1k8k: - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } + # If ISL/OSL = 8k/1k, DP_ATTENTION=true iff CONC > 64 8k1k: - - { tp: 8, ep: 8, conc: { start: 4, end: 32 } } - - { tp: 8, ep: 8, dp_attention: "true", conc: { start: 64, end: 64 } } + - { tp: 8, ep: 8, conc: { start: 4, end: 64 } } gb200: 1k1k: - { tp: 72, conc: { start: 8192, end: 8192 } } diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 03d007f5c..a027bda8b 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -77,7 +77,6 @@ jobs: CONC: ${{ matrix.config.conc }} EP_SIZE: ${{ matrix.config.ep || 1 }} DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }} - MOE_BACKEND: ${{ matrix.config.moe_backend || 'NONE' }} steps: - name: debug diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index dbffe8494..215de86e5 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -15,7 +15,6 @@ # PORT_OFFSET # EP_SIZE # DP_ATTENTION -# MOE_BACKEND echo "JOB 
$SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -28,62 +27,41 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO hf download $MODEL -# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -# EP_SIZE="1" -# MOE_BACKEND="TRTLLM" -# DP_ATTENTION=false - -# if [[ "$TP" == "4" ]]; then -# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 32 ]]; then -# EP_SIZE="$TP" -# fi -# if [[ $CONC -ge 256 ]]; then -# DP_ATTENTION=true -# MOE_BACKEND="CUTLASS" -# fi -# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then -# if [[ $CONC -gt 32 ]]; then -# EP_SIZE="$TP" -# fi -# if [[ $CONC -ge 256 ]]; then -# DP_ATTENTION=true -# MOE_BACKEND="CUTLASS" -# fi -# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 32 ]]; then -# EP_SIZE="$TP" -# DP_ATTENTION=true -# MOE_BACKEND="CUTLASS" -# fi -# fi -# elif [[ "$TP" == "8" ]]; then -# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 8 ]]; then -# EP_SIZE="$TP" -# fi -# if [[ $CONC -ge 256 ]]; then -# DP_ATTENTION=true -# MOE_BACKEND="CUTLASS" -# fi -# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then -# if [[ $CONC -gt 16 ]]; then -# EP_SIZE="$TP" -# fi -# if [[ $CONC -ge 256 ]]; then -# DP_ATTENTION=true -# MOE_BACKEND="CUTLASS" -# fi -# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 32 ]]; then -# EP_SIZE="$TP" -# DP_ATTENTION=true -# MOE_BACKEND="CUTLASS" -# fi -# fi -# fi - -# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +# # ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# Default +MOE_BACKEND="TRTLLM" + +if [[ "$TP" == "4" ]]; then + if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 256 ]]; then + MOE_BACKEND="CUTLASS" + fi + elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + if [[ $CONC -ge 256 ]]; then + MOE_BACKEND="CUTLASS" + fi + elif [[ "$ISL" == "8192" && 
"$OSL" == "1024" ]]; then + if [[ $CONC -gt 32 ]]; then + MOE_BACKEND="CUTLASS" + fi + fi +elif [[ "$TP" == "8" ]]; then + if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then + if [[ $CONC -ge 256 ]]; then + MOE_BACKEND="CUTLASS" + fi + elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then + if [[ $CONC -ge 256 ]]; then + MOE_BACKEND="CUTLASS" + fi + elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then + if [[ $CONC -gt 32 ]]; then + MOE_BACKEND="CUTLASS" + fi + fi +fi + +echo "MOE_BACKEND set to '$MOE_BACKEND'" SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) diff --git a/benchmarks/dsr1_fp8_b200_trt_slurm.sh b/benchmarks/dsr1_fp8_b200_trt_slurm.sh index 67dbb8396..6bc8c9fa7 100644 --- a/benchmarks/dsr1_fp8_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_b200_trt_slurm.sh @@ -15,40 +15,15 @@ # PORT_OFFSET # EP_SIZE # DP_ATTENTION -# MOE_BACKEND echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -# Default backend is DEEPGEMM -if [[ "MOE_BACKEND" == "NONE" ]]; then - MOE_BACKEND="DEEPGEMM" -fi +MOE_BACKEND="DEEPGEMM" echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" hf download $MODEL -# # ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -# EP_SIZE="$TP" -# MOE_BACKEND="DEEPGEMM" -# DP_ATTENTION=false - -# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 32 ]]; then -# DP_ATTENTION=true -# fi -# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then -# if [[ $CONC -gt 64 ]]; then -# DP_ATTENTION=true -# fi -# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 64 ]]; then -# DP_ATTENTION=true -# fi -# fi - -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" - SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) EXTRA_CONFIG_FILE="dsr1-fp8.yml" diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh 
b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index a71068867..3552fac78 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -15,40 +15,15 @@ # PORT_OFFSET # EP_SIZE # DP_ATTENTION -# MOE_BACKEND echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -# Default backend is CUTLASS -if [[ "MOE_BACKEND" == "NONE" ]]; then - MOE_BACKEND="CUTLASS" -fi +MOE_BACKEND="CUTLASS" -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND: $MOE_BACKEND" hf download $MODEL -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -# EP_SIZE="$TP" -# MOE_BACKEND="CUTLASS" -# DP_ATTENTION=false - -# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 64 ]]; then -# DP_ATTENTION=true -# fi -# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then -# if [[ $CONC -gt 64 ]]; then -# DP_ATTENTION=true -# fi -# elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then -# if [[ $CONC -gt 32 ]]; then -# DP_ATTENTION=true -# fi -# fi - -echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" - SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) EXTRA_CONFIG_FILE="dsr1-fp8.yml" diff --git a/benchmarks/gptoss_fp4_b200_trt_slurm.sh b/benchmarks/gptoss_fp4_b200_trt_slurm.sh index 9ffa26659..4f17d4d4f 100644 --- a/benchmarks/gptoss_fp4_b200_trt_slurm.sh +++ b/benchmarks/gptoss_fp4_b200_trt_slurm.sh @@ -15,54 +15,29 @@ # PORT_OFFSET # EP_SIZE # DP_ATTENTION -# MOE_BACKEND # GPTOSS TRTLLM Deployment Guide: # https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -# Default backend is TRTLLM -if [[ "MOE_BACKEND" == "NONE" ]]; then - 
MOE_BACKEND="TRTLLM" -fi - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION" hf download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=$(( 8888 + $PORT_OFFSET )) -# ========= Determine DP_ATTENTION, EP_SIZE and MOE_BACKEND based on ISL, OSL, CONC ========= -# EP_SIZE="1" -# MOE_BACKEND="TRTLLM" -# DP_ATTENTION=false - -# Lower concurrencies: Concurrency < 256 -# MoE backend=TRTLLM -# Use TP Attention; Switch to MoE Expert parallel for conurrency >=16 (1k1k and 1k8k) -# TEP_REQUIRED=false -# if [[ "$TP" == "4" || "$TP" == "8" ]]; then -# if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then -# TEP_REQUIRED=true -# elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then -# TEP_REQUIRED=true -# fi -# fi -# if [[ "$TEP_REQUIRED" == "true" && $CONC -ge 16 ]]; then -# EP_SIZE="$TP" -# fi +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# Default +MOE_BACKEND="TRTLLM" # Higher concurrencies: Concurrency >= 256 # MoE Backend = CUTLASS -# Use DP attention with expert parallel MoE -# if [[ $CONC -ge 256 ]]; then -# EP_SIZE="$TP" -# DP_ATTENTION=true -# MOE_BACKEND="CUTLASS" -# fi - -# echo "Final configuration: EP_SIZE='$EP_SIZE', MOE_BACKEND='$MOE_BACKEND', DP_ATTENTION='$DP_ATTENTION'" +if [[ $CONC -ge 256 ]]; then + MOE_BACKEND="CUTLASS" +fi + +echo "MOE_BACKEND set to $MOE_BACKEND" EXTRA_CONFIG_FILE="gptoss-fp4.yml" export TRTLLM_ENABLE_PDL=1 From a1f7b73cdb1591bc25f0934b912e5a4212ebadb3 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 16 Oct 2025 23:08:37 -0500 Subject: [PATCH 03/17] fixes; addming back explanatory comments --- .github/configs/search-space.yml | 7 +++++++ benchmarks/dsr1_fp4_b200_trt_slurm.sh | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/.github/configs/search-space.yml b/.github/configs/search-space.yml index 
34ed9f268..f89643d14 100644 --- a/.github/configs/search-space.yml +++ b/.github/configs/search-space.yml @@ -105,6 +105,7 @@ gptoss: llama: fp4: b200: + # fix: add TP=2,4 to B200, just as mi355 has 1k1k: - { tp: 1, conc: { start: 64, end: 64 } } - { tp: 2, conc: { start: 32, end: 64 } } @@ -121,6 +122,8 @@ llama: - { tp: 4, conc: { start: 4, end: 32 } } - { tp: 8, conc: { start: 4, end: 8 } } b200-trt: + # fix: add TP=2,4 to B200, just as mi355 has + # B200 can achieve TPS/User >= 30 with larger concurrency till 128 1k1k: - { tp: 1, conc: { start: 128, end: 128 } } - { tp: 2, conc: { start: 64, end: 128 } } @@ -172,6 +175,7 @@ llama: - { tp: [ 1, 2 ], conc: { start: 16, end: 64 } } - { tp: [ 4, 8 ], conc: { start: 4, end: 64 } } h200-trt: + # H200 can achieve TPS/User >= 30 with larger concurrency till 128 1k1k: - { tp: 1, conc: { start: 128, end: 128 } } - { tp: 2, conc: { start: 64, end: 128 } } @@ -187,6 +191,7 @@ llama: - { tp: 4, conc: { start: 4, end: 128 } } - { tp: 8, conc: { start: 4, end: 32 } } b200: + # fix: add TP=2,4 to B200, just as mi355 has 1k1k: - { tp: 1, conc: { start: 64, end: 64 } } - { tp: 2, conc: { start: 32, end: 64 } } @@ -201,6 +206,8 @@ llama: - { tp: 4, conc: { start: 4, end: 64 } } - { tp: 8, conc: { start: 4, end: 32 } } b200-trt: + # fix: add TP=2,4 to B200, just as mi355 has + # B200 can achieve TPS/User >= 30 with larger concurrency till 256 1k1k: - { tp: 1, conc: { start: 128, end: 128 } } - { tp: 2, conc: { start: 64, end: 128 } } diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index 215de86e5..b78e1ecfb 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -27,7 +27,7 @@ echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTIO hf download $MODEL -# # ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= +# ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= # Default MOE_BACKEND="TRTLLM" From 
e2d1a4c123275e20956ca7491e79676af958a63e Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 11:23:13 -0500 Subject: [PATCH 04/17] adding tests --- .github/workflows/benchmark-tmpl.yml | 3 +- utils/flatten_matrix.py | 141 ++++---- utils/test_flatten_matrix.py | 469 +++++++++++++++++++++++++++ 3 files changed, 554 insertions(+), 59 deletions(-) create mode 100644 utils/test_flatten_matrix.py diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index a027bda8b..bcc96983e 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -70,7 +70,8 @@ jobs: fail-fast: false matrix: config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }} - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} conc${{ matrix.config.conc }}' + + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}' env: TP: ${{ matrix.config.tp }} diff --git a/utils/flatten_matrix.py b/utils/flatten_matrix.py index a46bce549..7c75058d7 100644 --- a/utils/flatten_matrix.py +++ b/utils/flatten_matrix.py @@ -2,72 +2,97 @@ import os import json -with open('.github/configs/search-space.yml', 'r') as f: - search_space = yaml.safe_load(f) - -seq_len_map = { - '1024': '1k', - '8192': '8k', -} - -runner = os.environ['RUNNER'] -model = os.environ['MODEL'] -isl = os.environ['ISL'] -osl = os.environ['OSL'] -precision = os.environ['PRECISION'] - -# In the workflows, not all model references are the same so do a sort -# of partial matching to map to the search space keys -model_map = { - 'gpt-oss': 'gptoss', - 'llama-3.3-70b-instruct': 'llama', - 'deepseek-r1-0528': 'dsr1', -} - -model_key = None -model_lower = model.lower() -for key, value in model_map.items(): - if key.lower() in model_lower: - model_key = value - break - -if model_key is None: - raise 
ValueError(f"Model '{model}' is not recognized.") - -seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}" - -if isinstance(search_space, list): - entries = search_space -else: +def flatten_search_space(config_path, runner, model, isl, osl, precision): + with open(config_path, 'r') as f: + search_space = yaml.safe_load(f) + + seq_len_map = { + '1024': '1k', + '8192': '8k', + } + + model_map = { + 'gpt-oss': 'gptoss', + 'llama-3.3-70b-instruct': 'llama', + 'deepseek-r1-0528': 'dsr1', + } + + model_key = None + model_lower = model.lower() + for key, value in model_map.items(): + if key.lower() in model_lower: + model_key = value + break + + assert model_key, f"model '{model}' not recognized" + + assert seq_len_map.get(isl) and seq_len_map.get(osl), f"either isl or osl not recognized" + seq_len = f"{seq_len_map[isl]}{seq_len_map[osl]}" + + assert search_space.get(model_key, {}).get(precision), f"precision '{precision}' not recognized" + assert search_space.get(model_key, {}).get(precision).get(runner), f"runner '{runner}' not recognized" + entries = search_space.get(model_key, {}).get( - precision, {}).get(runner, {}).get(seq_len, []) + precision).get(runner, {}).get(seq_len, []) -flattened_search_space = [] -for entry in entries: - tp_list = entry['tp'] if isinstance( - entry.get('tp'), list) else [entry.get('tp')] + flattened_search_space = [] + for entry in entries: + assert entry.get('tp'), f"entry malformed, expecting field 'tp'" + tp = entry.get('tp') + assert isinstance(tp, int) or (isinstance(tp, list) and all(isinstance(x, int) for x in tp)), \ + f"entry malformed, expecting field 'tp' to be either an int or list of ints" + + tp_list = entry['tp'] if isinstance(entry['tp'], list) else [entry['tp']] - conc_config = entry.get('conc') - if isinstance(conc_config, dict): + conc_config = entry.get('conc') + + assert conc_config, f"entry malformed, missing field 'conc'" + assert isinstance(conc_config, dict), f"entry malformed, 'conc' must be a dict" + assert 'start' 
in conc_config, f"entry malformed, 'conc' missing required field 'start'" + assert 'end' in conc_config, f"entry malformed, 'conc' missing required field 'end'" + assert isinstance(conc_config['start'], int), f"entry malformed, 'conc.start' must be an int" + assert isinstance(conc_config['end'], int), f"entry malformed, 'conc.end' must be an int" + assert conc_config['start'] <= conc_config['end'], f"entry malformed, 'conc.start' must be <= 'conc.end'" + start = conc_config['start'] end = conc_config['end'] step_factor = conc_config.get('step', 2) + + if 'step' in conc_config: + assert isinstance(step_factor, int), f"entry malformed, 'conc.step' must be an int" + assert step_factor > 1, f"entry malformed, 'conc.step' must be > 1" + conc_list = [] current = start while current <= end: conc_list.append(current) current *= step_factor - elif isinstance(conc_config, list): - conc_list = conc_config - else: - conc_list = [conc_config] - - for tp_value in tp_list: - for conc_value in conc_list: - new_entry = entry.copy() - new_entry['tp'] = tp_value - new_entry['conc'] = conc_value - flattened_search_space.append(new_entry) - -with open(os.environ['GITHUB_OUTPUT'], 'a') as f: - f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n") + + for tp_value in tp_list: + for conc_value in conc_list: + new_entry = entry.copy() + new_entry['tp'] = tp_value + new_entry['conc'] = conc_value + flattened_search_space.append(new_entry) + + return flattened_search_space + + +def main(): + config_path = '.github/configs/search-space.yml' + runner = os.environ['RUNNER'] + model = os.environ['MODEL'] + isl = os.environ['ISL'] + osl = os.environ['OSL'] + precision = os.environ['PRECISION'] + + flattened_search_space = flatten_search_space( + config_path, runner, model, isl, osl, precision + ) + + with open(os.environ['GITHUB_OUTPUT'], 'a') as f: + f.write(f"flattened-matrix={json.dumps(flattened_search_space)}\n") + + +if __name__ == '__main__': + main() \ No newline at end of 
file diff --git a/utils/test_flatten_matrix.py b/utils/test_flatten_matrix.py new file mode 100644 index 000000000..b4f0620ad --- /dev/null +++ b/utils/test_flatten_matrix.py @@ -0,0 +1,469 @@ +import pytest +import yaml +from flatten_matrix import flatten_search_space + + +@pytest.fixture +def minimal_config(): + """Minimal valid config for testing""" + return { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': [2, 4], 'conc': {'start': 4, 'end': 8}} + ] + } + } + }, + 'llama': { + 'fp8': { + 'b200': { + '1k8k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 64, 'step': 4}} + ] + } + } + }, + 'dsr1': { + 'fp4': { + 'b200-trt': { + '8k1k': [ + {'tp': 4, 'conc': {'start': 4, 'end': 32}}, + {'tp': 4, 'ep': 4, 'dp_attention': 'true', 'conc': {'start': 64, 'end': 256}} + ] + } + } + } + } + + +@pytest.fixture +def config_file(minimal_config, tmp_path): + # temp config file + config_path = tmp_path / "search-space.yml" + with open(config_path, 'w') as f: + yaml.dump(minimal_config, f) + return config_path + + +class TestValidCases: + """Test valid input scenarios""" + + def test_single_tp_value(self, config_file): + """Test with single TP value""" + result = flatten_search_space( + config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8' + ) + + # Should generate: tp=2, conc=[4, 16, 64] with step=4 + assert len(result) == 3 + assert all(entry['tp'] == 2 for entry in result) + assert [entry['conc'] for entry in result] == [4, 16, 64] + + def test_list_of_tp_values(self, config_file): + """Test with list of TP values""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + # Should generate: tp=[2,4], conc=[4,8] = 2*2 = 4 combinations + assert len(result) == 4 + tp_values = [entry['tp'] for entry in result] + assert tp_values.count(2) == 2 + assert tp_values.count(4) == 2 + + def test_optional_fields_preserved(self, config_file): + """Test that optional fields like ep and dp_attention are preserved""" + result = 
flatten_search_space( + config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4' + ) + + # Second entry should have ep and dp_attention + entries_with_ep = [e for e in result if 'ep' in e] + assert len(entries_with_ep) > 0 + assert all(e['ep'] == 4 for e in entries_with_ep) + + entries_with_dp = [e for e in result if 'dp_attention' in e] + assert len(entries_with_dp) > 0 + assert all(e['dp_attention'] == 'true' for e in entries_with_dp) + + def test_default_step_factor(self, config_file): + """Test that default step factor of 2 is used when not specified""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + # conc: start=4, end=8, default step=2 -> [4, 8] + conc_values = sorted(set(entry['conc'] for entry in result)) + assert conc_values == [4, 8] + + def test_custom_step_factor(self, config_file): + """Test custom step factor""" + result = flatten_search_space( + config_file, 'b200', 'llama-3.3-70b-instruct', '1024', '8192', 'fp8' + ) + + # conc: start=4, end=64, step=4 -> [4, 16, 64] + conc_values = sorted(set(entry['conc'] for entry in result)) + assert conc_values == [4, 16, 64] + + +class TestModelMapping: + """Test model name mapping""" + + def test_gptoss_mapping(self, config_file): + """Test gpt-oss maps to gptoss""" + result = flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp4' + ) + assert len(result) > 0 + + def test_llama_mapping(self, config_file): + """Test llama mapping with case insensitivity""" + result = flatten_search_space( + config_file, 'b200', 'LLAMA-3.3-70B-INSTRUCT', '1024', '8192', 'fp8' + ) + assert len(result) > 0 + + def test_dsr1_mapping(self, config_file): + """Test deepseek-r1 maps to dsr1""" + result = flatten_search_space( + config_file, 'b200-trt', 'deepseek-r1-0528', '8192', '1024', 'fp4' + ) + assert len(result) > 0 + + +class TestInvalidInputs: + """Test error handling for invalid inputs""" + + def test_unrecognized_model(self, config_file): + 
"""Test error for unrecognized model""" + with pytest.raises(AssertionError, match="model .* not recognized"): + flatten_search_space( + config_file, 'h100', 'unknown-model', '1024', '1024', 'fp4' + ) + + def test_invalid_isl(self, config_file): + """Test error for invalid ISL""" + with pytest.raises(AssertionError, match="either isl or osl not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '2048', '1024', 'fp4' + ) + + def test_invalid_osl(self, config_file): + """Test error for invalid OSL""" + with pytest.raises(AssertionError, match="either isl or osl not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '4096', 'fp4' + ) + + def test_invalid_precision(self, config_file): + """Test error for invalid precision""" + with pytest.raises(AssertionError, match="precision .* not recognized"): + flatten_search_space( + config_file, 'h100', 'gpt-oss', '1024', '1024', 'fp16' + ) + + def test_invalid_runner(self, config_file): + """Test error for invalid runner""" + with pytest.raises(AssertionError, match="runner .* not recognized"): + flatten_search_space( + config_file, 'a100', 'gpt-oss', '1024', '1024', 'fp4' + ) + + +class TestMalformedEntries: + """Test validation of malformed config entries""" + + def test_missing_tp_field(self, tmp_path): + """Test error when tp field is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="entry malformed, expecting field 'tp'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_tp_wrong_type(self, tmp_path): + """Test error when tp is wrong type""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 'invalid', 'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" 
+ with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_tp_list_with_non_ints(self, tmp_path): + """Test error when tp list contains non-integers""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': [2, 'four', 8], 'conc': {'start': 4, 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="expecting field 'tp' to be either an int or list of ints"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_missing_conc_field(self, tmp_path): + """Test error when conc field is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="entry malformed, missing field 'conc'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_not_dict(self, tmp_path): + """Test error when conc is not a dict""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': [4, 8, 16]} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc' must be a dict"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_missing_start(self, tmp_path): + """Test error when conc.start is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with 
pytest.raises(AssertionError, match="'conc' missing required field 'start'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_missing_end(self, tmp_path): + """Test error when conc.end is missing""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc' missing required field 'end'"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_start_not_int(self, tmp_path): + """Test error when conc.start is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': '4', 'end': 8}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.start' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_end_not_int(self, tmp_path): + """Test error when conc.end is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': '8'}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.end' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_conc_start_greater_than_end(self, tmp_path): + """Test error when conc.start > conc.end""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 16, 'end': 4}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.start' must be <= 'conc.end'"): + 
flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_step_not_int(self, tmp_path): + """Test error when step is not an int""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': '2'}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.step' must be an int"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + def test_step_not_greater_than_one(self, tmp_path): + """Test error when step <= 1""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 2, 'conc': {'start': 4, 'end': 16, 'step': 1}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + with pytest.raises(AssertionError, match="'conc.step' must be > 1"): + flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + +class TestConcurrencyGeneration: + """Test concurrency value generation logic""" + + def test_geometric_progression(self, tmp_path): + """Test that concurrency values follow geometric progression""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 1, 'conc': {'start': 4, 'end': 64, 'step': 2}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + result = flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + conc_values = [entry['conc'] for entry in result] + assert conc_values == [4, 8, 16, 32, 64] + + def test_single_conc_value(self, tmp_path): + """Test when start equals end""" + config = { + 'gptoss': { + 'fp4': { + 'h100': { + '1k1k': [ + {'tp': 1, 'conc': {'start': 64, 'end': 64}} + ] + } + } + } + } + config_path = tmp_path / "config.yml" + with open(config_path, 'w') as f: + yaml.dump(config, f) + + result = 
flatten_search_space(config_path, 'h100', 'gpt-oss', '1024', '1024', 'fp4') + + assert len(result) == 1 + assert result[0]['conc'] == 64 + + +if __name__ == '__main__': + pytest.main([__file__, '-v']) \ No newline at end of file From 731300ced6295356221960a96f7db63f674ec7fe Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 11:24:38 -0500 Subject: [PATCH 05/17] removing extraneous whitespace --- benchmarks/dsr1_fp8_h200_trt_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp8_h200_trt_slurm.sh b/benchmarks/dsr1_fp8_h200_trt_slurm.sh index 3552fac78..5dfdf8617 100644 --- a/benchmarks/dsr1_fp8_h200_trt_slurm.sh +++ b/benchmarks/dsr1_fp8_h200_trt_slurm.sh @@ -32,7 +32,7 @@ cat > $EXTRA_CONFIG_FILE << EOF cuda_graph_config: enable_padding: true max_batch_size: 128 - enable_attention_dp: $DP_ATTENTION +enable_attention_dp: $DP_ATTENTION print_iter_log: true kv_cache_config: dtype: fp8 From d44695e2795c08e8dbfa3f8b5f98302b98645302 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 11:49:17 -0500 Subject: [PATCH 06/17] addings docs for new config --- .github/configs/CONFIGS.md | 80 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) create mode 100644 .github/configs/CONFIGS.md diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md new file mode 100644 index 000000000..35b804804 --- /dev/null +++ b/.github/configs/CONFIGS.md @@ -0,0 +1,80 @@ +# Search Space Configuration + +This file defines which benchmark configurations to run for each model, GPU, and precision combination. 
+ +## Quick Start + +Add a new configuration by following this pattern: +```yaml +model_name: + precision: + gpu_type: + sequence_length: + - { tp: [1, 2, 4], conc: { start: 4, end: 64 } } +``` + +## Field Definitions + +### Required Fields + +- **`tp`**: Tensor Parallelism (number of GPUs) + - Single value: `tp: 4` + - Multiple values: `tp: [2, 4, 8]` + +- **`conc`**: Concurrency (number of simultaneous requests) + - `start`: First value to test + - `end`: Last value to test + - `step`: Multiplier (default: 2) + - Example: `{start: 4, end: 64}` → tests [4, 8, 16, 32, 64] + +### Optional Fields + +- **`ep`**: Expert Parallelism for MoE models (default: 1) + +- **`dp_attention`**: Data Parallel Attention (default: `"false"`) + +## Examples + +### Basic configuration +```yaml +gptoss: + fp4: + h100: + 1k1k: # 1024 input, 1024 output + - { tp: [2, 4, 8], conc: { start: 4, end: 64 } } +``` +This tests 15 combinations: 3 TP values × 5 concurrency values + +### Configuration with optional fields +```yaml +dsr1: + fp4: + b200-trt: + 1k1k: + - { tp: 4, ep: 4, dp_attention: "true", conc: { start: 256, end: 256 } } +``` + +### Custom step factor +```yaml +llama: + fp8: + b200: + 1k8k: + - { tp: 2, conc: { start: 4, end: 64, step: 4 } } +``` +This tests [4, 16, 64] (multiplies by 4 instead of default 2) + +## Key Points + +1. **Models**: `gptoss`, `llama`, `dsr1` +2. **Precisions**: `fp4`, `fp8` +3. **Sequence lengths**: `1k1k`, `1k8k`, `8k1k` (input×output) +4. Each entry expands to test all combinations of TP and concurrency values +5. 
Use `-trt` suffix for TensorRT-optimized hardware configs + +## Testing Your Changes + +Run the flattening script to validate: +```bash +python utils/flatten_matrix.py +``` \ No newline at end of file From a148f00bf74cf6912dae828ba6bacc6c055143cf Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 12:10:57 -0500 Subject: [PATCH 07/17] adding summarization and result changes --- .github/configs/CONFIGS.md | 2 +- .../workflows/benchmark-multinode-tmpl.yml | 4 +- .github/workflows/benchmark-tmpl.yml | 4 +- utils/count_num_jobs.py | 37 +++++++++++++++++++ utils/process_result.py | 14 ++++--- utils/summarize.py | 8 ++-- 6 files changed, 57 insertions(+), 12 deletions(-) create mode 100644 utils/count_num_jobs.py diff --git a/.github/configs/CONFIGS.md b/.github/configs/CONFIGS.md index 35b804804..9021ec83e 100644 --- a/.github/configs/CONFIGS.md +++ b/.github/configs/CONFIGS.md @@ -70,7 +70,7 @@ This tests [4, 16, 64] (multiplies by 4 instead of default 2) 2. **Precisions**: `fp4`, `fp8` 3. **Sequence lengths**: `1k1k`, `1k8k`, `8k1k` (input×output) 4. Each entry expands to test all combinations of TP and concurrency values -5. Use `-trt` suffix for TensorRT-optimized hardware configs +5. There are comments throughout the yaml that were ported over from bash scripts describing what parallelism settings should be set depending on concurrency -- keep an eye out for those. 
## Testing Your Changes diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 07f5b876d..b5bcc5817 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -84,6 +84,8 @@ jobs: exit 1 fi + # NOTE: https://github.com/InferenceMAX/InferenceMAX/pull/111 adds EP_SIZE and DP_ATTENTION parsing to the process_results.py script + # but it is not yet implemented for GB200 multi-node, therefore just default to: 1 "false" - name: Process results run: | # Process each result file @@ -93,7 +95,7 @@ jobs: # Extract GPU count from filename for tp_size calculation gpus=$(echo "$result_file" | sed "s/.*_gpus\([0-9]*\)\.json/\1/") if [ -n "$gpus" ]; then - python3 utils/process_result.py ${{ inputs.runner }} $gpus ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE + python3 utils/process_result.py ${{ inputs.runner }} $gpus 1 "false" ${result_file%.json} $FRAMEWORK $PRECISION $MTP_MODE fi fi done diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index bcc96983e..901871254 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -70,7 +70,7 @@ jobs: fail-fast: false matrix: config: ${{ fromJson(needs.flatten-search-space-matrix.outputs.flattened-matrix) }} - + name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} tp${{ matrix.config.tp }} ep${{ matrix.config.ep || 1 }} conc${{ matrix.config.conc }}' env: @@ -153,7 +153,7 @@ jobs: - name: Process result run: | - python3 utils/process_result.py ${{ inputs.runner }} $TP $RESULT_FILENAME $FRAMEWORK $PRECISION + python3 utils/process_result.py ${{ inputs.runner }} $TP $EP_SIZE $DP_ATTENTION $RESULT_FILENAME $FRAMEWORK $PRECISION - name: Upload result uses: actions/upload-artifact@v4 diff --git a/utils/count_num_jobs.py b/utils/count_num_jobs.py new file mode 100644 index 000000000..776fd2a65 --- /dev/null +++ 
b/utils/count_num_jobs.py @@ -0,0 +1,37 @@ +import yaml +from collections import defaultdict + +with open('.github/configs/search-space.yml', 'r') as f: + data = yaml.safe_load(f) + +gpu_totals = defaultdict(int) +overall_total = 0 + +for model in data.values(): + for precision in model.values(): + for gpu, runner_data in precision.items(): + for seq_len in runner_data.values(): + for entry in seq_len: + # Count TP values + tp_list = entry['tp'] if isinstance(entry['tp'], list) else [entry['tp']] + tp_count = len(tp_list) + + # Count CONC values + conc = entry['conc'] + start, end = conc['start'], conc['end'] + step = conc.get('step', 2) + + conc_count = 0 + current = start + while current <= end: + conc_count += 1 + current *= step + + combo_count = tp_count * conc_count + gpu_totals[gpu] += combo_count + overall_total += combo_count + +print("Breakdown by GPU:") +for gpu in sorted(gpu_totals.keys()): + print(f" {gpu}: {gpu_totals[gpu]}") +print(f"\nTotal combinations: {overall_total}") \ No newline at end of file diff --git a/utils/process_result.py b/utils/process_result.py index 89c4aa7b3..a59d1f7f3 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -5,9 +5,11 @@ hw = sys.argv[1] tp_size = int(sys.argv[2]) -result_filename = sys.argv[3] -framework = sys.argv[4] -precision = sys.argv[5] +ep_size = int(sys.argv[3]) +dp_attention = sys.argv[4] +result_filename = sys.argv[5] +framework = sys.argv[6] +precision = sys.argv[7] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) @@ -15,7 +17,9 @@ data = { 'hw': hw, 'tp': tp_size, + 'ep': ep_size, 'conc': int(bmk_result['max_concurrency']), + 'dp_attention': dp_attention, # true or false 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, @@ -23,8 +27,8 @@ 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size } -if len(sys.argv) == 7: # MTP - data['mtp'] = sys.argv[6] +if len(sys.argv) == 9: # MTP + data['mtp'] = sys.argv[8] for key, 
value in bmk_result.items(): if key.endswith('ms'): diff --git a/utils/summarize.py b/utils/summarize.py index 1f78caf9c..546a13757 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -9,11 +9,11 @@ with open(result_path) as f: result = json.load(f) results.append(result) -results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['conc'])) +results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) summary_header = f'''\ -| Hardware | Framework | Precision | TP | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| Hardware | Framework | Precision | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) @@ -25,7 +25,9 @@ f"| {framework.upper()} " f"| {precision.upper()} " f"| {result['tp']} " + f"| {result['ep']} " f"| {result['conc']} " + f"| {result['dp_attention']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " f"| {result['median_e2el']:.4f} " From 3540e4cc4cbd428cbc371200d91bca77c8dd3196 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 12:14:13 -0500 Subject: [PATCH 08/17] TEST update workflow for testing --- .github/workflows/dsr1-tmpl.yml | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml index 64dfe4ec4..c8b7fd64d 100644 --- a/.github/workflows/dsr1-tmpl.yml +++ b/.github/workflows/dsr1-tmpl.yml @@ -40,21 +40,21 @@ on: default: false jobs: - bmk-h200-fp8: - if: ${{ inputs.use_h200 }} - uses: ./.github/workflows/benchmark-tmpl.yml - secrets: inherit - with: - runner: h200 - image: 'lmsysorg/sglang:v0.5.2rc2-cu126' - model: 'deepseek-ai/DeepSeek-R1-0528' - framework: 'sglang' - precision: 
'fp8' - exp-name: ${{ inputs.exp-name }} - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ inputs.max-model-len }} - random-range-ratio: ${{ inputs.random-range-ratio }} + # bmk-h200-fp8: + # if: ${{ inputs.use_h200 }} + # uses: ./.github/workflows/benchmark-tmpl.yml + # secrets: inherit + # with: + # runner: h200 + # image: 'lmsysorg/sglang:v0.5.2rc2-cu126' + # model: 'deepseek-ai/DeepSeek-R1-0528' + # framework: 'sglang' + # precision: 'fp8' + # exp-name: ${{ inputs.exp-name }} + # isl: ${{ inputs.isl }} + # osl: ${{ inputs.osl }} + # max-model-len: ${{ inputs.max-model-len }} + # random-range-ratio: ${{ inputs.random-range-ratio }} bmk-h200-trt-fp8: if: ${{ inputs.use_h200 }} From 01fc80ed16f4d7eff1bc0d025c64898b04aa91d2 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 12:15:33 -0500 Subject: [PATCH 09/17] TEST update workflow for testing --- .github/workflows/full-sweep-tmpl.yml | 92 +++++++++++++-------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml index b086460df..e739c9aa4 100644 --- a/.github/workflows/full-sweep-tmpl.yml +++ b/.github/workflows/full-sweep-tmpl.yml @@ -37,30 +37,30 @@ on: default: false jobs: - _70b-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} + # _70b-1k1k: + # if: ${{ inputs.run_1k1k }} + # uses: ./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_1k1k' + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # random-range-ratio: 0.8 + # use_h100: ${{ inputs.use_h100 }} + # use_h200: ${{ 
inputs.use_h200 }} + # use_b200: ${{ inputs.use_b200 }} + # use_mi300x: ${{ inputs.use_mi300x }} + # use_mi325x: ${{ inputs.use_mi325x }} + # use_mi355x: ${{ inputs.use_mi355x }} - collect-70b-1k1k-results: - needs: _70b-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_1k1k' + # collect-70b-1k1k-results: + # needs: _70b-1k1k + # if: ${{ inputs.run_1k1k && always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: '70b_1k1k' dsr1-1k1k: if: ${{ inputs.run_1k1k }} @@ -87,30 +87,30 @@ jobs: with: exp-name: 'dsr1_1k1k' - gptoss-1k1k: - if: ${{ inputs.run_1k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' - isl: 1024 - osl: 1024 - max-model-len: 2048 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} + # gptoss-1k1k: + # if: ${{ inputs.run_1k1k }} + # uses: ./.github/workflows/gptoss-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'gptoss_1k1k' + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # random-range-ratio: 0.8 + # use_h100: ${{ inputs.use_h100 }} + # use_h200: ${{ inputs.use_h200 }} + # use_b200: ${{ inputs.use_b200 }} + # use_mi300x: ${{ inputs.use_mi300x }} + # use_mi325x: ${{ inputs.use_mi325x }} + # use_mi355x: ${{ inputs.use_mi355x }} - collect-gptoss-1k1k-results: - needs: gptoss-1k1k - if: ${{ inputs.run_1k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_1k1k' + # collect-gptoss-1k1k-results: + # needs: gptoss-1k1k + # if: ${{ inputs.run_1k1k && always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: 'gptoss_1k1k' _70b-8k1k: if: ${{ inputs.run_8k1k }} 
From de36f9a8ae43fa0286794cd2a8c623cee009b225 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 12:36:42 -0500 Subject: [PATCH 10/17] fix incorrect formatting for summary --- utils/summarize.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/summarize.py b/utils/summarize.py index 546a13757..de8863c78 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -12,7 +12,7 @@ results.sort(key=lambda r: (r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r['tp'], r['ep'], r['conc'])) summary_header = f'''\ -| Hardware | Framework | Precision | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | +| Hardware | Framework | Precision | TP | EP | Conc | DP Attention | TTFT (ms) | TPOT (ms) | E2EL (s) | TPUT per GPU | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(summary_header) From 010866713884604e79874aaf0ad9f36e5afe669d Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 12:38:14 -0500 Subject: [PATCH 11/17] TEST update workflow for testing --- .github/workflows/full-sweep-tmpl.yml | 92 +++++++++++++-------------- 1 file changed, 46 insertions(+), 46 deletions(-) diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml index e739c9aa4..a612256d4 100644 --- a/.github/workflows/full-sweep-tmpl.yml +++ b/.github/workflows/full-sweep-tmpl.yml @@ -112,30 +112,30 @@ jobs: # with: # exp-name: 'gptoss_1k1k' - _70b-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/70b-tmpl.yml - secrets: inherit - with: - exp-name: '70b_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} + # _70b-8k1k: + # if: ${{ inputs.run_8k1k }} + # uses: 
./.github/workflows/70b-tmpl.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 + # use_h100: ${{ inputs.use_h100 }} + # use_h200: ${{ inputs.use_h200 }} + # use_b200: ${{ inputs.use_b200 }} + # use_mi300x: ${{ inputs.use_mi300x }} + # use_mi325x: ${{ inputs.use_mi325x }} + # use_mi355x: ${{ inputs.use_mi355x }} - collect-70b-8k1k-results: - needs: _70b-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: '70b_8k1k' + # collect-70b-8k1k-results: + # needs: _70b-8k1k + # if: ${{ inputs.run_8k1k && always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: '70b_8k1k' dsr1-8k1k: if: ${{ inputs.run_8k1k }} @@ -162,30 +162,30 @@ jobs: with: exp-name: 'dsr1_8k1k' - gptoss-8k1k: - if: ${{ inputs.run_8k1k }} - uses: ./.github/workflows/gptoss-tmpl.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' - isl: 8192 - osl: 1024 - max-model-len: 9216 - random-range-ratio: 0.8 - use_h100: ${{ inputs.use_h100 }} - use_h200: ${{ inputs.use_h200 }} - use_b200: ${{ inputs.use_b200 }} - use_mi300x: ${{ inputs.use_mi300x }} - use_mi325x: ${{ inputs.use_mi325x }} - use_mi355x: ${{ inputs.use_mi355x }} + # gptoss-8k1k: + # if: ${{ inputs.run_8k1k }} + # uses: ./.github/workflows/gptoss-tmpl.yml + # secrets: inherit + # with: + # exp-name: 'gptoss_8k1k' + # isl: 8192 + # osl: 1024 + # max-model-len: 9216 + # random-range-ratio: 0.8 + # use_h100: ${{ inputs.use_h100 }} + # use_h200: ${{ inputs.use_h200 }} + # use_b200: ${{ inputs.use_b200 }} + # use_mi300x: ${{ inputs.use_mi300x }} + # use_mi325x: ${{ inputs.use_mi325x }} + # use_mi355x: ${{ inputs.use_mi355x }} - collect-gptoss-8k1k-results: - needs: gptoss-8k1k - if: ${{ inputs.run_8k1k && always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: 'gptoss_8k1k' + # 
collect-gptoss-8k1k-results: + # needs: gptoss-8k1k + # if: ${{ inputs.run_8k1k && always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: 'gptoss_8k1k' _70b-1k8k: if: ${{ inputs.run_1k8k }} From 2f015cc3988c187daa85a2e4a64d5140cae5fdbb Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 12:42:24 -0500 Subject: [PATCH 12/17] timeout minutes for flatten job --- .github/workflows/benchmark-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 901871254..f6460f930 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -64,7 +64,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} needs: flatten-search-space-matrix - timeout-minutes: 180 + timeout-minutes: 2 strategy: fail-fast: false From 96c128e1dc5ef1fd1b471471d6cce143b12be31a Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 12:51:53 -0500 Subject: [PATCH 13/17] remove debug --- .github/workflows/benchmark-tmpl.yml | 3 --- 1 file changed, 3 deletions(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index f6460f930..77df1aee8 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -80,9 +80,6 @@ jobs: DP_ATTENTION: ${{ matrix.config.dp_attention || 'false' }} steps: - - name: debug - run: echo "${{ matrix.config }}" - - name: Resource cleanup run: | if command -v docker >/dev/null 2>&1 && docker info >/dev/null 2>&1; then From 4aadd32f7d8450df457fd74c7ca2c944637aea3c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 13:05:59 -0500 Subject: [PATCH 14/17] remove debug --- .github/workflows/full-sweep-tmpl.yml | 184 +++++++++++++------------- 1 file changed, 92 insertions(+), 92 deletions(-) diff --git a/.github/workflows/full-sweep-tmpl.yml b/.github/workflows/full-sweep-tmpl.yml index 
a612256d4..b086460df 100644 --- a/.github/workflows/full-sweep-tmpl.yml +++ b/.github/workflows/full-sweep-tmpl.yml @@ -37,30 +37,30 @@ on: default: false jobs: - # _70b-1k1k: - # if: ${{ inputs.run_1k1k }} - # uses: ./.github/workflows/70b-tmpl.yml - # secrets: inherit - # with: - # exp-name: '70b_1k1k' - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # random-range-ratio: 0.8 - # use_h100: ${{ inputs.use_h100 }} - # use_h200: ${{ inputs.use_h200 }} - # use_b200: ${{ inputs.use_b200 }} - # use_mi300x: ${{ inputs.use_mi300x }} - # use_mi325x: ${{ inputs.use_mi325x }} - # use_mi355x: ${{ inputs.use_mi355x }} + _70b-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} - # collect-70b-1k1k-results: - # needs: _70b-1k1k - # if: ${{ inputs.run_1k1k && always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: '70b_1k1k' + collect-70b-1k1k-results: + needs: _70b-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_1k1k' dsr1-1k1k: if: ${{ inputs.run_1k1k }} @@ -87,55 +87,55 @@ jobs: with: exp-name: 'dsr1_1k1k' - # gptoss-1k1k: - # if: ${{ inputs.run_1k1k }} - # uses: ./.github/workflows/gptoss-tmpl.yml - # secrets: inherit - # with: - # exp-name: 'gptoss_1k1k' - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # random-range-ratio: 0.8 - # use_h100: ${{ inputs.use_h100 }} - # use_h200: ${{ inputs.use_h200 }} - # use_b200: ${{ inputs.use_b200 }} - # use_mi300x: ${{ inputs.use_mi300x }} - # use_mi325x: ${{ inputs.use_mi325x }} - # use_mi355x: ${{ inputs.use_mi355x }} + 
gptoss-1k1k: + if: ${{ inputs.run_1k1k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' + isl: 1024 + osl: 1024 + max-model-len: 2048 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} - # collect-gptoss-1k1k-results: - # needs: gptoss-1k1k - # if: ${{ inputs.run_1k1k && always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: 'gptoss_1k1k' + collect-gptoss-1k1k-results: + needs: gptoss-1k1k + if: ${{ inputs.run_1k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_1k1k' - # _70b-8k1k: - # if: ${{ inputs.run_8k1k }} - # uses: ./.github/workflows/70b-tmpl.yml - # secrets: inherit - # with: - # exp-name: '70b_8k1k' - # isl: 8192 - # osl: 1024 - # max-model-len: 9216 - # random-range-ratio: 0.8 - # use_h100: ${{ inputs.use_h100 }} - # use_h200: ${{ inputs.use_h200 }} - # use_b200: ${{ inputs.use_b200 }} - # use_mi300x: ${{ inputs.use_mi300x }} - # use_mi325x: ${{ inputs.use_mi325x }} - # use_mi355x: ${{ inputs.use_mi355x }} + _70b-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/70b-tmpl.yml + secrets: inherit + with: + exp-name: '70b_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} - # collect-70b-8k1k-results: - # needs: _70b-8k1k - # if: ${{ inputs.run_8k1k && always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: '70b_8k1k' + collect-70b-8k1k-results: + needs: _70b-8k1k + if: ${{ inputs.run_8k1k && 
always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: '70b_8k1k' dsr1-8k1k: if: ${{ inputs.run_8k1k }} @@ -162,30 +162,30 @@ jobs: with: exp-name: 'dsr1_8k1k' - # gptoss-8k1k: - # if: ${{ inputs.run_8k1k }} - # uses: ./.github/workflows/gptoss-tmpl.yml - # secrets: inherit - # with: - # exp-name: 'gptoss_8k1k' - # isl: 8192 - # osl: 1024 - # max-model-len: 9216 - # random-range-ratio: 0.8 - # use_h100: ${{ inputs.use_h100 }} - # use_h200: ${{ inputs.use_h200 }} - # use_b200: ${{ inputs.use_b200 }} - # use_mi300x: ${{ inputs.use_mi300x }} - # use_mi325x: ${{ inputs.use_mi325x }} - # use_mi355x: ${{ inputs.use_mi355x }} + gptoss-8k1k: + if: ${{ inputs.run_8k1k }} + uses: ./.github/workflows/gptoss-tmpl.yml + secrets: inherit + with: + exp-name: 'gptoss_8k1k' + isl: 8192 + osl: 1024 + max-model-len: 9216 + random-range-ratio: 0.8 + use_h100: ${{ inputs.use_h100 }} + use_h200: ${{ inputs.use_h200 }} + use_b200: ${{ inputs.use_b200 }} + use_mi300x: ${{ inputs.use_mi300x }} + use_mi325x: ${{ inputs.use_mi325x }} + use_mi355x: ${{ inputs.use_mi355x }} - # collect-gptoss-8k1k-results: - # needs: gptoss-8k1k - # if: ${{ inputs.run_8k1k && always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: 'gptoss_8k1k' + collect-gptoss-8k1k-results: + needs: gptoss-8k1k + if: ${{ inputs.run_8k1k && always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: 'gptoss_8k1k' _70b-1k8k: if: ${{ inputs.run_1k8k }} From 964c5c2367dd5ba4e6dd98b9f7fb020fec754760 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 13:10:10 -0500 Subject: [PATCH 15/17] add timeout to correct job --- .github/workflows/benchmark-tmpl.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 77df1aee8..ee58f4bb7 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ 
b/.github/workflows/benchmark-tmpl.yml @@ -49,6 +49,7 @@ env: jobs: flatten-search-space-matrix: + timeout-minutes: 2 runs-on: ubuntu-latest outputs: flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }} @@ -64,7 +65,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} needs: flatten-search-space-matrix - timeout-minutes: 2 + timeout-minutes: 180 strategy: fail-fast: false From b0ff2bad6de3c66556547b0c0d6bb7cb7d00e934 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 13:17:49 -0500 Subject: [PATCH 16/17] add more descriptive name to flattening matrix --- .github/workflows/benchmark-tmpl.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index ee58f4bb7..2eaaa7edd 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -49,7 +49,8 @@ env: jobs: flatten-search-space-matrix: - timeout-minutes: 2 + name: get search space ${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.precision }} + timeout-minutes: 180 runs-on: ubuntu-latest outputs: flattened-matrix: ${{ steps.flatten.outputs.flattened-matrix }} From 488315000f114c5c6ddaf7bd263e1a91a0eb8070 Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Fri, 17 Oct 2025 13:26:01 -0500 Subject: [PATCH 17/17] fix moe backend --- benchmarks/dsr1_fp4_b200_trt_slurm.sh | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/benchmarks/dsr1_fp4_b200_trt_slurm.sh b/benchmarks/dsr1_fp4_b200_trt_slurm.sh index b78e1ecfb..d13584078 100644 --- a/benchmarks/dsr1_fp4_b200_trt_slurm.sh +++ b/benchmarks/dsr1_fp4_b200_trt_slurm.sh @@ -18,19 +18,14 @@ echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -# Default backend is TRTLLM -if [[ "MOE_BACKEND" == "NONE" ]]; then - MOE_BACKEND="TRTLLM" -fi +# Default +MOE_BACKEND="TRTLLM" echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL, EP_SIZE: $EP_SIZE, DP_ATTENTION=$DP_ATTENTION, MOE_BACKEND=$MOE_BACKEND" hf 
download $MODEL # ========= Determine MOE_BACKEND based on ISL, OSL, CONC ========= -# Default -MOE_BACKEND="TRTLLM" - if [[ "$TP" == "4" ]]; then if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then if [[ $CONC -ge 256 ]]; then