diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 938011d47..a2674153a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -5,6 +5,7 @@ dsr1-fp4-mi355x-sglang: runner: mi355x precision: fp4 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -27,6 +28,7 @@ dsr1-fp8-mi300x-sglang: runner: mi300x precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -48,6 +50,7 @@ dsr1-fp8-mi325x-sglang: runner: mi325x precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -69,6 +72,7 @@ dsr1-fp8-mi355x-sglang: runner: mi355x precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -90,6 +94,7 @@ gptoss-fp4-mi300x-vllm: runner: mi300x precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -120,6 +125,7 @@ gptoss-fp4-mi325x-vllm: runner: mi325x precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -150,6 +156,7 @@ gptoss-fp4-mi355x-vllm: runner: mi355x precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a3d848475..c4370f483 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -5,6 +5,7 @@ dsr1-fp4-b200-sglang: runner: b200 precision: fp4 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -29,6 +30,7 @@ dsr1-fp4-b200-trt: runner: b200-trt precision: fp4 framework: trt + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -79,6 +81,7 @@ dsr1-fp8-b200-sglang: runner: b200 precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -100,6 +103,7 @@ dsr1-fp8-b200-trt: runner: b200-trt precision: fp8 framework: trt + multinode: false seq-len-configs: # For all sequence lengths, 
EP=TP - isl: 1024 @@ -126,6 +130,7 @@ dsr1-fp8-h200-sglang: runner: h200 precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -147,6 +152,7 @@ dsr1-fp8-h200-trt: runner: h200 precision: fp8 framework: trt + multinode: false # For all sequence lengths, EP=TP seq-len-configs: - isl: 1024 @@ -173,7 +179,8 @@ gptoss-fp4-b200-trt: runner: b200-trt precision: fp4 framework: trt - # Enable DP_ATTENTION for conc >= 32 + multinode: false + # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true seq-len-configs: - isl: 1024 osl: 1024 @@ -208,6 +215,7 @@ gptoss-fp4-b200-vllm: runner: b200 precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -238,6 +246,7 @@ gptoss-fp4-h100-vllm: runner: h100 precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -265,6 +274,7 @@ gptoss-fp4-h200-trt: runner: h200 precision: fp4 framework: trt + multinode: false # For all sequence lengths, EP=TP, DP_ATTENTION=false seq-len-configs: - isl: 1024 @@ -296,6 +306,7 @@ gptoss-fp4-h200-vllm: runner: h200 precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -318,3 +329,545 @@ gptoss-fp4-h200-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } + +dsr1-fp4-gb200-dynamo-trt: + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 + model: deepseek-r1-fp4 + model-prefix: dsr1 + runner: gb200 + precision: fp4 + framework: dynamo-trt + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # NOTE: Prefill tp and ep are always 4 because each GB200 node has 4 GPUs and + # ctx_tp_size is hardcoded to 4 in launch_gb200-nv.sh. Decode tp/ep matches gen_tp_size. 
+ # For 1k/1k: prefill batch-size=4, max-num-tokens=4608 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 16, 36 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=3" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 512, 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=1" + + - spec-decoding: "mtp" + conc-list: [ 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.6" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 2252 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + 
additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=1" + + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 2048, 4300 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 4300 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - 
"PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=512" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=0" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 18 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=3" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 128, 269 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=8" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 538 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 1075 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + 
additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=1" + + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 34 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - conc-list: [ 256, 538 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=16" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 1075 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 2150 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - 
"PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 2150 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=0" + +dsr1-fp8-gb200-dynamo-sglang: + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: gb200 + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 4096 ] + prefill: + num-worker: 2 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + - "N_ADDITIONAL_FRONTENDS=9" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" + + # "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4) + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 64, 128 ] + prefill: + num-worker: 1 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - 
"N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=1p_4d" + decode: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=4" + + # "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48) + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] + prefill: + num-worker: 3 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=6" + - "N_ADDITIONAL_FRONTENDS=9" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + prefill: + num-worker: 6 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=12" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=6" diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 003b8809f..99a33959a 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -1,72 +1,104 @@ # How to Test Workflows -In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. As input, this workflow takes in the CLI arguments for the `utils/matrix-logic/generate_sweep_configs.py` script. The usage for this script is shown below: +In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. 
As input, this workflow takes in the CLI arguments for the `utils/matrix_logic/generate_sweep_configs.py` script. The usage for this script is shown below: ``` -usage: generate_sweep_configs.py [-h] {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} ... +usage: generate_sweep_configs.py [-h] {full-sweep,runner-model-sweep} ... Generate benchmark configurations from YAML config files positional arguments: - {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} + {full-sweep,runner-model-sweep} Available commands - full-sweep Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths - test-config Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config. - runner-model-sweep Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate - that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner - nodes. - runner-sweep Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is - meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runners nodes that should run gptoss-120b - actually do so successfully. - custom Enter custom values + full-sweep Generate full sweep configurations with optional + filtering by model, precision, framework, runner type, + and sequence lengths + runner-model-sweep Given a runner type, find all configurations matching + the type, and run that configuration on all individual + runner nodes for the specified runner type. 
This is + meant to validate that all runner nodes work on all + configurations for a runner type. For instance, to + validate that all configs that specify an h200 runner + successfully run across all h200 runner nodes. options: -h, --help show this help message and exit ``` -Instead of explaining each command at a high level, let's just walk through some common testing scenarios and describe how to run them. +## `full-sweep` Command -**Scenario 1**: I want to change increase the concurrency from 128 to 256 in the 1k1k scenario for the `dsr1-fp4-b200-sglang` config (from `.github/configs/nvidia-master.yaml`) and then test it. +The `full-sweep` command generates benchmark configurations with optional filtering. It requires specifying either `--single-node` or `--multi-node`. -Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the text following command as the text input: ``` -test-config --key dsr1-fp4-b200-sglang --seq-len 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +usage: generate_sweep_configs.py full-sweep + --config-files CONFIG_FILES [CONFIG_FILES ...] + --runner-config RUNNER_CONFIG + [--model-prefix MODEL_PREFIX [MODEL_PREFIX ...]] + [--precision PRECISION [PRECISION ...]] + [--framework FRAMEWORK [FRAMEWORK ...]] + [--runner-type RUNNER_TYPE [RUNNER_TYPE ...]] + [--seq-lens {1k1k,1k8k,8k1k} [{1k1k,1k8k,8k1k} ...]] + [--step-size STEP_SIZE] + [--max-conc MAX_CONC] + [--max-tp MAX_TP] + [--max-ep MAX_EP] + (--single-node | --multi-node) ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986046399 +### Examples -If we wanted to also test 1k8k or 8k1k scenarios, we would simply append `1k8k` or `8k1k` to `--seq-len`, respectively. - -Further, if we wanted to run that config on *one specific* runner node, we could specify that by appending `--runner-node` to the argument list. 
Note that if the specified runner node is not compatible with the specified config key (as dictated by `.github/configs/runners.yaml`), then the workflow will error: +**Test all single-node gptoss configurations on B200 with 1k1k sequence lengths:** +``` +full-sweep --single-node --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` +**Test all single-node fp8 precision configs for 1k8k workloads:** +``` +full-sweep --single-node --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml ``` -test-config --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --key dsr1-fp4-b200-sglang --seq-len 1k1k --runner-node mi300x-amd_0 -ValueError: Runner node 'mi300x-amd_0' is not compatible with config 'dsr1-fp4-b200-sglang' which runs on runner type 'b200'. Available runner nodes for this config are 'b200-nb_0, b200-nb_1, b200-nvd_0, b200-nvd_1, b200-nvd_2, b200-nvd_3, b200-tg_0'. +**Test all single-node TRT configs on H200 runners:** +``` +full-sweep --single-node --framework trt --runner-type h200 b200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986053019/job/54229839736 +**Test specific single-node model on specific hardware with specific sequence lengths:** +``` +full-sweep --single-node --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sglang --seq-lens 1k1k 8k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` -**Scenario 2**: I just made a change to the `benchmarks/dsr1_fp8_b200_docker.sh` and I need to verify that these changes work across all B200 runners. 
+**Limit concurrency and parallelism for faster testing:** +``` +full-sweep --single-node --max-conc 64 --max-tp 4 --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` -Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the text following command as the text input: +**Test all multi-node configurations:** ``` -runner-sweep --runner-type b200 --model-prefix dsr1 --precision fp8 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +full-sweep --multi-node --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986283169 +## `runner-model-sweep` Command + +The `runner-model-sweep` command validates that all runner nodes of a specific type work with all model configurations. It requires specifying either `--single-node` or `--multi-node`. + +``` +usage: generate_sweep_configs.py runner-model-sweep + --config-files CONFIG_FILES [CONFIG_FILES ...] + --runner-config RUNNER_CONFIG + --runner-type RUNNER_TYPE + [--runner-node-filter RUNNER_NODE_FILTER] + (--single-node | --multi-node) +``` -This will run a test (just the highest available parallelism and lowest available concurrency) for each B200 runner node for each Deepseek config that runs on B200 with fp8 precision. I.e., this can be used to "sweep" across runners for a particular model to test that all runners still work with changes that have been made. +### Scenario: Validating Runner Infrastructure -**Scenario 3**: I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. +I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. 
Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: ``` -runner-model-sweep --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +runner-model-sweep --single-node --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986292917 - This will run a test (just the highest available parallelism and lowest available concurrency) for each configuration that specifies the `h200` runner type, across all H200 runner nodes defined in `.github/configs/runners.yaml`. For example, if you have configs `dsr1-fp8-h200-sglang`, `dsr1-fp8-h200-trt`, and `gptoss-fp4-h200-vllm` that all use `runner: h200`, and you have 8 H200 nodes (`h200-cw_0`, `h200-cw_1`, etc.), this will run all 3 configs on all 8 nodes (24 total test runs). @@ -76,49 +108,110 @@ This is particularly useful when: - You've added new runner nodes and want to validate they work with all existing model configurations - You want to verify that all models remain compatible with a specific GPU type after system updates -**Key difference from Scenario 2**: -- `runner-sweep`: Fix a **model**, sweep across runners → "Does this model work on all its runners?" -- `runner-model-sweep`: Fix a **runner type**, sweep across models → "Do all models work on this runner type?" 
+### Filtering Runner Nodes -## Additional Use Cases with `full-sweep` - -The `full-sweep` command supports multiple filters that can be combined for targeted testing: - -**Test all gptoss configurations on B200 with 1k1k sequence lengths:** +Use `--runner-node-filter` to only test a subset of runner nodes: ``` -full-sweep --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +runner-model-sweep --single-node --runner-type mi300x --runner-node-filter mi300x-amd --config-files .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml ``` -**Test all fp8 precision configs across all runners for 1k8k workloads:** -``` -full-sweep --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml -``` +This will only include runner nodes whose names contain "mi300x-amd" -**Test all TRT configs on H200 runners:** -``` -full-sweep --framework trt --runner-type h200 b200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml -``` +## Validation Architecture -**Quick smoke test of all configs (highest TP, lowest concurrency only):** -``` -full-sweep --test-mode --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml +The benchmarking system uses a strict validation methodology to ensure correctness at every stage. This is implemented in `utils/matrix_logic/validation.py` using Pydantic models. + +### Validation Methodology + +The system validates **both ends** of the configuration pipeline: + +1. **Input Validation (Master Configs)**: Validates the structure of `.github/configs/*.yaml` files before any processing occurs +2. 
**Output Validation (Matrix Entries)**: Validates the generated matrix entries that are passed to workflow templates + +This dual-validation approach ensures: +- No malformed configurations enter the pipeline +- No invalid parameters reach the benchmark workflows +- Workflow templates (`benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`) can assume all inputs are valid—no runtime validation needed + +### Input Validation: Master Config Files + +Master config files (e.g., `nvidia-master.yaml`, `amd-master.yaml`) are validated against strict Pydantic schemas: + +- **`SingleNodeMasterConfigEntry`**: Validates single-node configurations +- **`MultiNodeMasterConfigEntry`**: Validates multi-node configurations + +Each config must specify: +- Required fields: `image`, `model`, `model-prefix`, `precision`, `framework`, `runner`, `multinode` +- Sequence length configs with search spaces defining TP, EP, concurrency ranges, etc. +- Optional fields like `disagg`, `spec-decoding`, `dp-attn` + +Invalid or missing fields raise immediate validation errors before any matrix generation. + +### Output Validation: Matrix Entries + +Generated matrix entries (the actual workflow inputs) are validated against: + +- **`SingleNodeMatrixEntry`**: Matches the inputs expected by `benchmark-tmpl.yml` +- **`MultiNodeMatrixEntry`**: Matches the inputs expected by `benchmark-multinode-tmpl.yml` + +These Pydantic models mirror the workflow template input definitions exactly. For example, `benchmark-tmpl.yml` expects: +```yaml +inputs: + runner: required + image: required + model: required + model-prefix: required + precision: required + framework: required + ... ``` -**Test specific model on specific hardware with specific sequence lengths:** +The corresponding `SingleNodeMatrixEntry` enforces these same fields with appropriate types. + +### Key Design Principles + +1. **No defaults in output validation**: Matrix entry models don't set defaults. 
Missing values must fail validation rather than silently using fallbacks. + +2. **`extra='forbid'`**: Unknown fields are rejected, preventing typos or deprecated fields from slipping through. + +3. **Strict typing**: Fields like `spec-decoding` use `Literal["mtp", "draft_model", "none"]` to restrict values to known options. + +4. **Concurrency validation**: The system ensures either `conc-list` OR `conc-start`/`conc-end` is provided, but not both. + +### Validation Flow + ``` -full-sweep --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sglang --seq-lens 1k1k 8k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +.github/configs/*.yaml + │ + ▼ +┌─────────────────────────┐ +│ validate_master_config │ ← Input validation (Pydantic) +└─────────────────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ generate_sweep_configs │ ← Matrix generation +└─────────────────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ validate_matrix_entry │ ← Output validation (Pydantic) +└─────────────────────────┘ + │ + ▼ + benchmark-tmpl.yml or + benchmark-multinode-tmpl.yml ``` -## Custom One-off Tests +## Utility Scripts -**Scenario 4**: I want to run a quick test with a custom image, model, or configuration that isn't in the config files yet. +### `utils/summarize.py` -Use the `custom` command to specify all parameters manually: -``` -custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.0 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +Aggregates benchmark results from a directory of JSON files and outputs a markdown summary table. Used after `collect-results.yml` downloads all artifacts. + +Usage: +```bash +python utils/summarize.py ``` -This runs a single 1k1k test job with your custom parameters on the specified runner node. 
Useful for: -- Testing new images before adding them to config files -- Quick validation of new models -- Experimenting with different frameworks or precisions +Outputs GitHub-flavored markdown tables with metrics including TTFT, TPOT, interactivity, E2EL, and throughput per GPU for both single-node and multi-node results. diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 879912d34..dae671e29 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -12,6 +12,9 @@ on: model: required: true type: string + model-prefix: + required: true + type: string framework: required: true type: string @@ -27,27 +30,81 @@ on: osl: required: true type: string + conc-list: + required: true + type: string + spec-decoding: + required: true + type: string + disagg: + required: true + type: string + max-model-len: required: true type: string random-range-ratio: required: false type: string - default: '0.8' - mtp-mode: + default: "0.8" + + prefill-num-worker: + required: true + type: string + prefill-tp: + required: true + type: string + prefill-ep: + required: true + type: string + prefill-dp-attn: required: true type: string + prefill-additional-settings: + required: false + type: string + default: "[]" + + decode-num-worker: + required: true + type: string + decode-tp: + required: true + type: string + decode-ep: + required: true + type: string + decode-dp-attn: + required: true + type: string + decode-additional-settings: + required: false + type: string + default: "[]" env: EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} + MODEL_PREFIX: ${{ inputs.model-prefix }} FRAMEWORK: ${{ inputs.framework }} PRECISION: ${{ inputs.precision }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} - MTP_MODE: ${{ inputs.mtp-mode }} + CONC_LIST: ${{ 
join(fromJson(inputs.conc-list), ' ') }} + SPEC_DECODING: ${{ inputs.spec-decoding }} + DISAGG: ${{ inputs.disagg }} + + PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} + PREFILL_TP: ${{ inputs.prefill-tp }} + PREFILL_EP: ${{ inputs.prefill-ep }} + PREFILL_DP_ATTN: ${{ inputs.prefill-dp-attn }} + + DECODE_NUM_WORKERS: ${{ inputs.decode-num-worker }} + DECODE_TP: ${{ inputs.decode-tp }} + DECODE_EP: ${{ inputs.decode-ep }} + DECODE_DP_ATTN: ${{ inputs.decode-dp-attn }} permissions: contents: read @@ -56,7 +113,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 480 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} mtp-${{ inputs.mtp-mode }}' + name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} specdecod-${{ inputs.spec-decoding }}" steps: - name: Resource cleanup @@ -76,8 +133,10 @@ jobs: - name: Launch multi-node job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_mtp-${{ env.MTP_MODE }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_ptp${{ env.PREFILL_TP }}pep${{ env.PREFILL_EP}}_dtp${{ env.DECODE_TP}}dep${{ env.DECODE_EP }}_${{ env.FRAMEWORK }}_specdec-${{ env.SPEC_DECODING }}_conc${{ join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | + set -x + export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} bash ./runners/launch_${RUNNER_NAME%%_*}.sh # Check if at least one result file was created if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then @@ -103,7 +162,7 @@ jobs: if [ -n "$gpus" ]; then echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus" - TP=$gpus RESULT_FILENAME=${result_file%.json} EP_SIZE=1 DP_ATTENTION=false PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py + 
RESULT_FILENAME=${result_file%.json} IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py fi fi done diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 2ea94ed55..29dbb35e3 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -11,6 +11,9 @@ on: model: required: true type: string + model-prefix: + required: true + type: string precision: required: true type: string @@ -41,6 +44,12 @@ on: conc: required: true type: string + spec-decoding: + required: true + type: string + disagg: + required: true + type: string random-range-ratio: required: false type: string @@ -51,6 +60,7 @@ env: HF_HUB_CACHE: '/mnt/hf_hub_cache/' EXP_NAME: ${{ inputs.exp-name }} MODEL: ${{ inputs.model }} + MODEL_PREFIX: ${{ inputs.model-prefix }} ISL: ${{ inputs.isl }} OSL: ${{ inputs.osl }} MAX_MODEL_LEN: ${{ inputs.max-model-len }} @@ -62,6 +72,8 @@ env: EP_SIZE: ${{ inputs.ep }} DP_ATTENTION: ${{ inputs.dp-attn }} CONC: ${{ inputs.conc }} + SPEC_DECODING: ${{ inputs.spec-decoding }} + DISAGG: ${{ inputs.disagg }} permissions: contents: read diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 8c0487320..ca5de7fda 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -29,7 +29,9 @@ jobs: pattern: ${{ inputs.exp-name && format('{0}_*', inputs.exp-name) || '*' }} - name: Print summary - run: python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY + run: | + pip install tabulate + python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY - name: Aggregate results run: python3 utils/collect_results.py results/ ${{ inputs.exp-name || 'all' }} @@ -39,15 +41,3 @@ jobs: with: name: results_${{ inputs.exp-name || 'all' }} path: agg_${{ inputs.exp-name || 'all' }}.json - - - name: Plot performance - run: | - pip install -q matplotlib - python3 utils/plot_perf.py results/ ${{ 
inputs.exp-name || 'all' }} - - name: Upload performance graphs - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: graphs_${{ inputs.exp-name || 'all' }} - path: | - tput_vs_intvty_*_${{ inputs.exp-name || 'all' }}.png - tput_vs_e2el_*_${{ inputs.exp-name || 'all' }}.png diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 378e58848..7025a2a5c 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,5 +1,5 @@ name: End-to-End Tests -run-name: e2e Test - ${{ github.event.inputs.generate-cli-command }} +run-name: e2e Test - ${{ inputs.test-name || github.event.inputs.generate-cli-command }} on: workflow_dispatch: @@ -8,6 +8,10 @@ on: description: "Command passed to generate matrix script" required: true type: string + test-name: + description: "Name for this test run" + required: false + type: string jobs: get-jobs: @@ -21,13 +25,57 @@ jobs: - id: get-jobs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ + ${{ inputs.generate-cli-command }} \ + --runner-config .github/configs/runners.yaml \ + --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - test-sweep: + test-sweep-multi-node: + needs: get-jobs + # Use existence (or non-existence) of 'prefill' field as a proxy to determined multi-node tests. + # We only need to check the first entry because by design, all entries in the matrix will be of the same type (there will + # never be a mix of multi-node and single-node entries as output from generate_sweep_configs.py). 
+ if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill != null }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: ${{ inputs.generate-cli-command }} multi-node + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + test-sweep-single-node: needs: get-jobs + if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill == null }} uses: ./.github/workflows/benchmark-tmpl.yml - name: ${{ inputs.generate-cli-command }} + name: ${{ inputs.generate-cli-command }} single-node strategy: fail-fast: false matrix: @@ -41,15 +89,18 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ 
matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-results: - needs: test-sweep + needs: [test-sweep-multi-node, test-sweep-single-node] if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index f97cd093c..dabb334f6 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -2,14 +2,13 @@ name: "Full Sweep Scheduler - 1k1k" on: workflow_dispatch: - schedule: - - cron: "0 0 * * *" jobs: get-dsr1-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 @@ -17,13 +16,16 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 @@ -31,120 +33,169 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix 
gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT - benchmark-dsr1: + benchmark-dsr1-multi-node: needs: get-dsr1-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k / + if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: dsr1 1k1k multi-node / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }} secrets: inherit with: - exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 - max-model-len: 2048 + max-model-len: 2248 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - benchmark-gptoss: - needs: get-gptoss-configs + exp-name: "dsr1_1k1k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ 
matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + benchmark-dsr1-single-node: + needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / + name: dsr1 1k1k single-node / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }} secrets: inherit with: - exp-name: "gptoss_1k1k" + exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 - max-model-len: 2048 + max-model-len: 2248 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} - # This is a workaround until we can integrate GB200 into master configs. 
- benchmark-gb200:
+ benchmark-gptoss-multi-node:
+ needs: get-gptoss-configs
+ if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }}
uses: ./.github/workflows/benchmark-multinode-tmpl.yml
- name: gb200 1k1k sweep /
+ name: gptoss 1k1k multi-node /
strategy:
fail-fast: false
matrix:
- config:
- - {
- "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3",
- "model": "deepseek-r1-fp4",
- "model-prefix": "dsr1",
- "precision": "fp4",
- "framework": "dynamo-trtllm",
- "mtp": "off",
- }
- - {
- "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3",
- "model": "deepseek-r1-fp4",
- "model-prefix": "dsr1",
- "precision": "fp4",
- "framework": "dynamo-trtllm",
- "mtp": "on",
- }
- - {
- "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1",
- "model": "deepseek-ai/DeepSeek-R1-0528",
- "model-prefix": "dsr1",
- "precision": "fp8",
- "framework": "dynamo-sglang",
- "mtp": "off",
- }
+ config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
secrets: inherit
with:
- runner: gb200
+ isl: 1024
+ osl: 1024
+ max-model-len: 2248
+ runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
+ model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
- exp-name: ${{ matrix.config.model-prefix }}_1k1k
+ exp-name: "gptoss_1k1k"
+ conc-list: ${{ toJson(matrix.config.conc) }}
+ spec-decoding: ${{ matrix.config.spec-decoding }}
+ disagg: ${{ matrix.config.disagg }}
+
+ prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
+ prefill-tp: ${{ matrix.config.prefill.tp }}
+ prefill-ep: ${{ matrix.config.prefill.ep }}
+ prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
+ prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
+
+ decode-num-worker: ${{ matrix.config.decode.num-worker }}
+ decode-tp: ${{ matrix.config.decode.tp }}
+ 
decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + benchmark-gptoss-single-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k single-node / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k1k" isl: 1024 osl: 1024 - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} + max-model-len: 2248 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-dsr1-results: - needs: [benchmark-dsr1, benchmark-gb200] - if: ${{ always() }} + needs: + [ + get-dsr1-configs, + benchmark-dsr1-single-node, + benchmark-dsr1-multi-node, + ] + if: ${{ always() && needs.get-dsr1-configs.result == 'success' && needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' && needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "dsr1_1k1k" collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} + needs: + [ + get-gptoss-configs, + benchmark-gptoss-single-node, + benchmark-gptoss-multi-node, + ] + if: ${{ always() && needs.get-gptoss-configs.result == 'success' && needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' && 
needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "gptoss_1k1k" calc-success-rate: - needs: [benchmark-dsr1, benchmark-gptoss, benchmark-gb200] + needs: [collect-dsr1-results, collect-gptoss-results] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index cd8c07c74..015586de8 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -2,14 +2,13 @@ name: "Full Sweep Scheduler - 1k8k" on: workflow_dispatch: - schedule: - - cron: "0 0 * * *" jobs: get-dsr1-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 @@ -17,13 +16,16 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 
${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 @@ -31,75 +33,169 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT - benchmark-dsr1: + benchmark-dsr1-multi-node: needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: dsr1 1k8k multi-node / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 8192 + max-model-len: 9416 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: "dsr1_1k8k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + benchmark-dsr1-single-node: + needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k8k / + name: dsr1 1k8k single-node / strategy: fail-fast: false matrix: - config: ${{ 
fromJson(needs.get-dsr1-configs.outputs.search-space-config) }}
+ config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }}
secrets: inherit
with:
exp-name: "dsr1_1k8k"
isl: 1024
osl: 8192
- max-model-len: 9216
+ max-model-len: 9416
runner: ${{ matrix.config.runner }}
image: ${{ matrix.config.image }}
model: ${{ matrix.config.model }}
+ model-prefix: ${{ matrix.config.model-prefix }}
framework: ${{ matrix.config.framework }}
precision: ${{ matrix.config.precision }}
tp: ${{ matrix.config.tp }}
ep: ${{ matrix.config.ep }}
dp-attn: ${{ matrix.config.dp-attn }}
conc: ${{ matrix.config.conc }}
+ spec-decoding: ${{ matrix.config.spec-decoding }}
+ disagg: ${{ matrix.config.disagg }}

- benchmark-gptoss:
+ benchmark-gptoss-multi-node:
+ needs: get-gptoss-configs
+ if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }}
+ uses: ./.github/workflows/benchmark-multinode-tmpl.yml
+ name: gptoss 1k8k multi-node /
+ strategy:
+ fail-fast: false
+ matrix:
+ config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }}
+ secrets: inherit
+ with:
+ isl: 1024
+ osl: 8192
+ max-model-len: 9416
+ runner: ${{ matrix.config.runner }}
+ image: ${{ matrix.config.image }}
+ model: ${{ matrix.config.model }}
+ model-prefix: ${{ matrix.config.model-prefix }}
+ framework: ${{ matrix.config.framework }}
+ precision: ${{ matrix.config.precision }}
+ exp-name: "gptoss_1k8k"
+ conc-list: ${{ toJson(matrix.config.conc) }}
+ spec-decoding: ${{ matrix.config.spec-decoding }}
+ disagg: ${{ matrix.config.disagg }}
+
+ prefill-num-worker: ${{ matrix.config.prefill.num-worker }}
+ prefill-tp: ${{ matrix.config.prefill.tp }}
+ prefill-ep: ${{ matrix.config.prefill.ep }}
+ prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }}
+ prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }}
+
+ decode-num-worker: ${{ matrix.config.decode.num-worker }}
+ decode-tp: ${{ matrix.config.decode.tp }}
+ 
decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + benchmark-gptoss-single-node: needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k8k / + name: gptoss 1k8k single-node / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} secrets: inherit with: exp-name: "gptoss_1k8k" isl: 1024 osl: 8192 - max-model-len: 9216 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-dsr1-results: - needs: benchmark-dsr1 - if: ${{ always() }} + needs: + [ + get-dsr1-configs, + benchmark-dsr1-single-node, + benchmark-dsr1-multi-node, + ] + if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "dsr1_1k8k" collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} + needs: + [ + get-gptoss-configs, + benchmark-gptoss-single-node, + benchmark-gptoss-multi-node, + ] + if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "gptoss_1k8k" calc-success-rate: - needs: [benchmark-dsr1, benchmark-gptoss] + needs: [collect-dsr1-results, collect-gptoss-results] if: ${{ always() }} 
runs-on: ubuntu-latest diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 036794eef..7022900c7 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -2,14 +2,13 @@ name: "Full Sweep Scheduler - 8k1k" on: workflow_dispatch: - schedule: - - cron: "0 0 * * *" jobs: get-dsr1-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 @@ -17,13 +16,16 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> 
$GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 @@ -31,120 +33,169 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT - benchmark-dsr1: + benchmark-dsr1-multi-node: needs: get-dsr1-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 8k1k / + if: ${{ 
needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: dsr1 8k1k multi-node / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }} secrets: inherit with: - exp-name: "dsr1_8k1k" isl: 8192 osl: 1024 - max-model-len: 9216 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - benchmark-gptoss: - needs: get-gptoss-configs + exp-name: "dsr1_8k1k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + benchmark-dsr1-single-node: + needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 8k1k / + name: dsr1 8k1k single-node / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) 
}} + config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }} secrets: inherit with: - exp-name: "gptoss_8k1k" + exp-name: "dsr1_8k1k" isl: 8192 osl: 1024 - max-model-len: 9216 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} - # This is a workaround until we can integrate GB200 into master configs. - benchmark-gb200: + benchmark-gptoss-multi-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 8k1k sweep / + name: gptoss 8k1k multi-node / strategy: fail-fast: false matrix: - config: - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } + config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }} secrets: inherit with: - runner: gb200 + isl: 8192 + osl: 1024 + max-model-len: 9416 + runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model 
}} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k + exp-name: "gptoss_8k1k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + benchmark-gptoss-single-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 8k1k single-node / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_8k1k" isl: 8192 osl: 1024 - max-model-len: 9216 + max-model-len: 9416 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-dsr1-results: - needs: [benchmark-dsr1, benchmark-gb200] - if:
${{ always() }} + needs: + [ + get-dsr1-configs, + benchmark-dsr1-single-node, + benchmark-dsr1-multi-node, + ] + if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "dsr1_8k1k" collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} + needs: + [ + get-gptoss-configs, + benchmark-gptoss-single-node, + benchmark-gptoss-multi-node, + ] + if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "gptoss_8k1k" calc-success-rate: - needs: [benchmark-dsr1, benchmark-gptoss, benchmark-gb200] + needs: [collect-dsr1-results, collect-gptoss-results] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml deleted file mode 100644 index f9664be19..000000000 --- a/.github/workflows/full-sweep-test.yml +++ /dev/null @@ -1,445 +0,0 @@ -name: Test - Full Sweep - -on: - workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - run_1k8k: - type: boolean - required: false - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - get-configs: - runs-on: ubuntu-latest - outputs: - dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} - dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} - dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} - gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} - gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} - gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} - steps: - - name: 
Checkout code - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - - # This looks complicated, but it is just calling generate_sweep_configs.py conditioned on - # discrete inputs (i.e., run_1k1k, run_h100, etc.) to split the test sweep into discrete jobs - - id: generate-configs - run: | - pip install pydantic - - set -x - # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" - - # DSR1 doesn't support H100, so exclude it - DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - - # Generate dsr1 configs (only if we have valid runner types for DSR1) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n 
"$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs (only if we have runner types selected) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES 
--runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # DSR1 1K1K Benchmarks - benchmark-dsr1-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-1k1k-results: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - # GPTOSS 1K1K Benchmarks - benchmark-gptoss-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ 
matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k1k-results: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" - - # DSR1 8K1K Benchmarks - benchmark-dsr1-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-8k1k-results: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - # GPTOSS 8K1K Benchmarks - benchmark-gptoss-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: 
${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-8k1k-results: - needs: benchmark-gptoss-8k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - # DSR1 1K8K Benchmarks - benchmark-dsr1-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - # This is a workaround until we can integrate GB200 into master configs. 
- benchmark-gb200-1k1k: - if: ${{ inputs.use_gb200 && inputs.run_1k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: &dsr1_static_configs - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k1k - isl: 1024 - osl: 1024 - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-1k8k: - if: ${{ inputs.use_gb200 && inputs.run_1k8k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k8k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k8k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-8k1k: - if: ${{ inputs.use_gb200 && inputs.run_8k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 8k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - 
secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - collect-dsr1-1k8k-results: - needs: - [ - benchmark-dsr1-1k8k, - benchmark-gb200-1k1k, - benchmark-gb200-1k8k, - benchmark-gb200-8k1k, - ] - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - - # GPTOSS 1K8K Benchmarks - benchmark-gptoss-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k8k-results: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - calc-success-rate: - needs: - [ - collect-dsr1-1k1k-results, - collect-dsr1-1k8k-results, - collect-dsr1-8k1k-results, - collect-gptoss-1k1k-results, - collect-gptoss-1k8k-results, - collect-gptoss-8k1k-results, - ] - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: 
"results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml deleted file mode 100644 index c700599d9..000000000 --- a/.github/workflows/gb200-tests.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: GB200 Tests - -on: - workflow_dispatch: - inputs: - image: - description: "Serving Image" - required: true - type: choice - options: - - "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1" - - "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3" - - model: - description: "Model" - required: true - type: choice - options: - - "deepseek-ai/DeepSeek-R1-0528" - - "deepseek-r1-fp4" - - precision: - description: "Precision" - required: true - type: choice - options: - - "fp4" - - "fp8" - - framework: - description: "Framework" - required: true - type: choice - options: - - "dynamo-trtllm" - - "dynamo-sglang" - - mtp: - description: "Mtp On/Off" - required: true - type: choice - options: - - "on" - - "off" - - isl: - description: "ISL" - required: true - type: string - - osl: - description: "OSL" - required: true - type: string - -jobs: - pre-run: - runs-on: ubuntu-latest - outputs: - max-model-len: ${{ steps.calc.outputs.max-model-len }} - steps: - - id: calc - shell: python - run: | - import os - import sys - try: - isl = int("${{ 
inputs.isl }}") - osl = int("${{ inputs.osl }}") - except ValueError: - print("Error: ISL and OSL must be integers") - sys.exit(1) - with open(os.environ['GITHUB_OUTPUT'], 'a') as f: - f.write(f"max-model-len={isl + osl}\n") - - benchmark-gb200: - needs: pre-run - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 test - secrets: inherit - with: - runner: gb200 - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: dsr1_1k1k - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ needs.pre-run.outputs.max-model-len }} - mtp-mode: ${{ inputs.mtp }} diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index ee89ff419..82046ff4a 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -56,12 +56,11 @@ jobs: for label in matching: result = subprocess.run([ - 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py", + 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix_logic/generate_sweep_configs.py", 'full-sweep', '--runner-type', label['runner-type'], '--model-prefix', label['model-prefix'], '--seq-lens', '1k1k', - '--test-mode', '--config-files', f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", @@ -102,12 +101,15 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-results: needs: [get-jobs, validate] diff --git 
a/.github/workflows/pr-line-counter.yml b/.github/workflows/pr-line-counter.yml index c616493ed..b9f4bf6b1 100644 --- a/.github/workflows/pr-line-counter.yml +++ b/.github/workflows/pr-line-counter.yml @@ -2,9 +2,9 @@ name: PR Line Counter on: pull_request: - types: [opened, synchronize, reopened, ready_for_review] + types: [opened, reopened, ready_for_review] paths: - - 'utils/matrix-logic/**' + - 'utils/matrix_logic/generate_sweep_configs.py' permissions: contents: read @@ -17,7 +17,7 @@ jobs: contents: read pull-requests: write env: - TARGET_FILE: utils/matrix-logic/generate_sweep_configs.py + TARGET_FILE: utils/matrix_logic/generate_sweep_configs.py steps: - name: Checkout code @@ -74,7 +74,7 @@ jobs: - name: Generate summary run: | - echo "## 📊 Line Count Report" >> $GITHUB_STEP_SUMMARY + echo "## Line Count Report" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY FILE_EXISTS="${{ steps.line-count.outputs.file_exists }}" @@ -94,20 +94,20 @@ jobs: echo "**Base Lines:** $BASE_LINE_COUNT" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY if [ "$LINE_DIFF" -gt 0 ]; then - echo "**Change:** +$LINE_DIFF lines 📈" >> $GITHUB_STEP_SUMMARY + echo "**Change:** +$LINE_DIFF lines" >> $GITHUB_STEP_SUMMARY elif [ "$LINE_DIFF" -lt 0 ]; then - echo "**Change:** $LINE_DIFF lines 📉" >> $GITHUB_STEP_SUMMARY + echo "**Change:** $LINE_DIFF lines" >> $GITHUB_STEP_SUMMARY else - echo "**Change:** No change ➡️" >> $GITHUB_STEP_SUMMARY + echo "**Change:** No change" >> $GITHUB_STEP_SUMMARY fi else echo "" >> $GITHUB_STEP_SUMMARY echo "**Base Lines:** 0 (new file)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "**Change:** +$LINE_DIFF lines 📈" >> $GITHUB_STEP_SUMMARY + echo "**Change:** +$LINE_DIFF lines" >> $GITHUB_STEP_SUMMARY fi else - echo "⚠️ **File not found:** \`$TARGET_FILE\`" >> $GITHUB_STEP_SUMMARY + echo "**File not found:** \`$TARGET_FILE\`" >> $GITHUB_STEP_SUMMARY fi - name: Comment on PR diff --git a/.github/workflows/test-matrix-logic.yml 
b/.github/workflows/test-matrix-logic.yml index bd7ec278a..79011e824 100644 --- a/.github/workflows/test-matrix-logic.yml +++ b/.github/workflows/test-matrix-logic.yml @@ -1,9 +1,10 @@ name: Test Matrix Logic +run-name: "Config Parsing Pytests PR #${{ github.event.pull_request.number }}" on: pull_request: paths: - - 'utils/matrix-logic/**' + - 'utils/matrix_logic/**' permissions: contents: read @@ -29,7 +30,12 @@ jobs: python -m pip install --upgrade pip pip install pytest pydantic pyyaml - - name: Run pytest + - name: test_generate_sweep_configs tests run: | - cd utils/matrix-logic + cd utils/matrix_logic pytest test_generate_sweep_configs.py -v + + - name: test_validation tests + run: | + cd utils/matrix_logic + pytest test_validation.py -v diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index c0dd511ea..2b4c20c72 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -2,6 +2,27 @@ # Shared benchmarking utilities for InferenceMAX +# Check if required environment variables are set +# Usage: check_env_vars VAR1 VAR2 VAR3 ... 
+# Exits with code 1 if any variable is not set +check_env_vars() { + local missing_vars=() + + for var_name in "$@"; do + if [[ -z "${!var_name}" ]]; then + missing_vars+=("$var_name") + fi + done + + if [[ ${#missing_vars[@]} -gt 0 ]]; then + echo "Error: The following required environment variables are not set:" + for var in "${missing_vars[@]}"; do + echo " - $var" + done + exit 1 + fi +} + # Wait for server to be ready by polling the health endpoint # All parameters are required # Parameters: diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh new file mode 100644 index 000000000..7a105158f --- /dev/null +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -0,0 +1,63 @@ +#!/usr/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_MAX_NUM_TOKENS PREFILL_MAX_BATCH_SIZE DECODE_MAX_NUM_TOKENS \ + DECODE_MAX_BATCH_SIZE DECODE_GPU_MEM_FRACTION + +if [ "$SPEC_DECODING" == "mtp" ]; then + check_env_vars DECODE_MTP_SIZE +else + DECODE_MTP_SIZE="0" +fi + +PERFORMANCE_SWEEPS_PATH="components/backends/trtllm/performance_sweeps" + +echo "Cloning Dynamo repository..." 
+git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo +git checkout release/0.5.1-rc0.20251105 +git submodule update --init --recursive + +cd "$PERFORMANCE_SWEEPS_PATH" + +# Set up environment variables based on ISL/OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 +elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then + export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 +else + echo "Unsupported ISL/OSL combination: $ISL/$OSL" + exit 1 +fi + +kind=dynamo_disagg +additional_slurm_args="--time=04:00:00" +ntasks_per_node=4 + +gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) +total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) +total_tasks=$((total_nodes * ntasks_per_node)) + +decode_eplb_num_slots=0 + +sbatch --nodes=${total_nodes} \ + --ntasks=${total_tasks} \ + --ntasks-per-node=${ntasks_per_node} \ + --segment=${total_nodes} ${additional_slurm_args} \ + benchmark_disagg.slurm \ + ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ + ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ + ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ + ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ + ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ + ${DECODE_GPU_MEM_FRACTION} ${decode_eplb_num_slots} \ + ${DECODE_MTP_SIZE} "${CONC_LIST}" \ + ${gen_nodes} ${kind} \ + ${MODEL_PATH} ${SERVED_MODEL_NAME} \ + ${IMAGE} ${ISL} ${OSL} \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh new file mode 100644 index 000000000..99e2c7afd --- /dev/null +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -0,0 +1,39 @@ + +#!/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME + 
+# Always clone and setup Dynamo +echo "Cloning Dynamo repository..." +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git +else + git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git +fi + +cd "$SGL_SLURM_JOBS_PATH" + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="04:00:00" +export MODEL_PATH=$MODEL_PATH +export CONFIG_DIR=$CONFIG_DIR +export CONTAINER_IMAGE=$IMAGE + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimted by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +bash ./submit_disagg.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $N_ADDITIONAL_FRONTENDS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + $SCRIPT_MODE \ No newline at end of file diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 11ee233ea..d9164469e 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -2,6 +2,7 @@ # This script sets up the environment and launches multi-node benchmarks +set -x # Set up environment variables for SLURM export SLURM_PARTITION="batch" @@ -21,6 +22,14 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then fi export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" + + # FIXME: Another workaround for all the different branching + # THIS NEEDS TO BE STANDARDIZED ASAP + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" + else + export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs" + fi else SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import 
-o $SQUASH_FILE docker://$IMAGE" @@ -36,207 +45,22 @@ fi export ISL="$ISL" export OSL="$OSL" -### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs -if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then - - # Set up Dynamo repository path - DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" - PERFORMANCE_SWEEPS_PATH="$DYNAMO_PATH/components/backends/trtllm/performance_sweeps" - - # Overview: - # The Dynamo repository contains the bench_serving repository as a submodule. - # The submit_disagg.sh script, located at $PERFORMANCE_SWEEPS_PATH, orchestrates the entire benchmarking workflow: - # 1. Launches the Dynamo inference service with the specified configuration. - # 2. Waits for the service to become healthy. - # 3. Initiates benchmarking using the bench_serving tools. - # 4. Monitors all jobs until completion. - # 5. Collects and processes the results. - - # Always clone and setup Dynamo - echo "Cloning Dynamo repository..." - rm -rf "$DYNAMO_PATH" - git clone https://github.com/ai-dynamo/dynamo.git "$DYNAMO_PATH" - cd "$DYNAMO_PATH" - git checkout release/0.5.1-rc0.20251105 - git submodule update --init --recursive - - # Navigate to performance sweeps directory - cd "$PERFORMANCE_SWEEPS_PATH" - - # 1. CACHE_TRANSCEIVER_MAX_NUM_TOKENS controls the max_tokens_in_buffer value - # in cache_transceiver_config of TensorRT-LLM context and generation workers. - # Specifically, it is the max number of tokens the transfer buffer can fit. 
- - # Set up environment variables based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 - elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 - else - echo "Unsupported ISL/OSL combination: $ISL/$OSL" - exit 1 - fi - - # Generate benchmark configurations based on ISL/OSL and MTP mode - generate_benchmark_configs() { - local isl="$1" - local osl="$2" - local mtp_mode="$3" - - # Usage: - # ./submit_disagg.sh [ctx_num] [gen_num] [gen_tp_size] [gen_batch_size] [gen_max_num_tokens] [gen_gpu_memory_fraction] [gen_eplb_num_slots] [gen_mtp_size] [gen_concurrency_list]" - # MTP Modes: - # mtp=off - Run without Multi-Token Prediction (gen_mtp_size=0) - # mtp=on - Run with Multi-Token Prediction (gen_mtp_size=1,2,3) - # Execution Modes: - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # dep - Run Data-Expert Parallel mode (attention_dp=true) - # Parameters for tep/dep modes: - # ctx_num: Number of context nodes - # gen_num: Number of generation nodes - # gen_tp_size: Generation tensor parallel size - # gen_batch_size: Generation batch size - # gen_max_num_tokens: Generation max number of tokens - # gen_gpu_memory_fraction: GPU memory fraction (0.7-0.95) - # gen_mtp_size: Multi-Token Prediction size (0 for mtp=off, 1-3 for mtp=on) - # gen_eplb_num_slots: Expert load balancing slots (0, 256, 288) - # gen_concurrency_list: Concurrency values (space-separated, quoted) - - if [ "$isl" = "1024" ] && [ "$osl" = "1024" ]; then - if [ "$mtp_mode" = "on" ]; then - echo "Running 1k/1k MTP=ON configurations" - - ./submit_disagg.sh "mtp=on" "tep" 1 4 8 32 128 "0.9" 3 0 "1 2 4 8 16 36" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 16 64 256 "0.7" 3 0 "512 1075" - - ./submit_disagg.sh "mtp=on" "dep" 2 1 16 128 256 "0.7" 1 0 "2150" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 32 16 64 "0.6" 3 0 "512" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 8 256 512 "0.8" 1 0 "2252" - 
else - echo "Running 1k/1k MTP=OFF configurations" - - ./submit_disagg.sh "mtp=off" "tep" 1 4 8 128 128 "0.9" 0 0 "1 2 4 8 16 32 64 141" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 32 32 32 "0.7" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 16 64 64 "0.75" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 2 1 16 256 256 "0.75" 0 0 "2048 4300" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 8 512 512 "0.8" 0 0 "4300" - fi - elif [ "$isl" = "8192" ] && [ "$osl" = "1024" ]; then - if [ "$mtp_mode" = "on" ]; then - echo "Running 8k/1k MTP=ON configurations" - - ./submit_disagg.sh "mtp=on" "tep" 1 3 8 16 64 "0.9" 3 0 "1 2 4 8 18" - - ./submit_disagg.sh "mtp=on" "dep" 5 1 32 8 32 "0.7" 3 0 "128 269" - - ./submit_disagg.sh "mtp=on" "dep" 8 1 32 16 64 "0.7" 3 0 "538" - - ./submit_disagg.sh "mtp=on" "dep" 8 1 16 64 256 "0.75" 2 0 "1075" - - ./submit_disagg.sh "mtp=on" "dep" 6 1 8 256 512 "0.8" 1 0 "2150" - else - echo "Running 8k/1k MTP=OFF configurations" - - ./submit_disagg.sh "mtp=off" "tep" 1 3 8 32 32 "0.9" 0 0 "1 2 4 8 16 34" - - ./submit_disagg.sh "mtp=off" "dep" 4 1 32 16 16 "0.7" 0 0 "256 538" - - ./submit_disagg.sh "mtp=off" "dep" 6 1 16 64 64 "0.75" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 8 1 16 128 128 "0.75" 0 0 "2150" - - ./submit_disagg.sh "mtp=off" "dep" 5 1 8 256 256 "0.8" 0 0 "2150" - fi - else - echo "Unsupported ISL/OSL combination: $isl/$osl" - exit 1 - fi - } - - # Run all benchmark configurations - generate_benchmark_configs "$ISL" "$OSL" "$MTP_MODE" - -else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" - # Set up Dynamo repository path - set -x - DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/examples/backends/sglang/slurm_jobs" - else - SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/components/backends/sglang/slurm_jobs" - fi - - # Always clone and setup Dynamo - echo "Cloning Dynamo repository..." 
- rm -rf "$DYNAMO_PATH" - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - # TODO: before merge this will be a different branch off of main - git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git $DYNAMO_PATH - else - git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git $DYNAMO_PATH - fi - - cd "$DYNAMO_PATH" - - # Navigate to corresponding directory - cd "$SGL_SLURM_JOBS_PATH" - - # Set up SGL launch script-specific environment variables - export SLURM_ACCOUNT=$SLURM_ACCOUNT - export SLURM_PARTITION=$SLURM_PARTITION - export TIME_LIMIT="04:00:00" - export MODEL_PATH=$MODEL_PATH - export CONFIG_DIR=$CONFIG_DIR - export CONTAINER_IMAGE=$IMAGE - - # Launch jobs based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - NUMBER_OF_EXPERIMENTS=3 - - top_of_curve_concurrency_list="4096" - middle_of_curve_concurrency_list="1024x2048x4096" - bottom_of_curve_concurrency_list="2x4x8x16x64x128" - - # Top of curve (2 prefill workers each at DEP8 and 1 decode worker at DEP32) - bash ./submit_disagg.sh 4 2 8 1 9 $ISL $OSL $top_of_curve_concurrency_list inf - - # Bottom of curve (1 prefill worker at DEP4 and 4 decode workers at DEP4) - bash ./submit_disagg.sh 1 1 4 4 9 $ISL $OSL $bottom_of_curve_concurrency_list inf 1p_4d - - # Middle of curve (3 prefill workers each at DEP8 and 1 decode worker at DEP48) - bash ./submit_disagg.sh 6 3 12 1 9 $ISL $OSL $middle_of_curve_concurrency_list inf - - elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then - NUMBER_OF_EXPERIMENTS=1 - - concurrency_list="128x256x384x448x512x576x1024x2048x4096" - bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL $concurrency_list inf - else - echo "Unsupported ISL/OSL combination: $ISL/$OSL" - exit 1 - fi - - set +x -fi +bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}_slurm.sh" # Wait for all jobs to complete echo "Waiting for all jobs to complete..." 
while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do echo "Jobs still running..." - squeue -u $USER - sleep 60 + squeue --steps -u $USER + sleep 30 done -echo "All jobs completed" -### FRAMEWORK_DIFF_IF_STATEMENT #3 - difference in log post-processing -if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then +# FIXME: The below is bad and is a result of the indirection of the ways in which +# Dynamo jobs are launched. In a follow-up PR, the location of the result file should not +# depend on the runner, it should always be in the same spot in the GH workspace. + +# Process results from all configurations +if [[ $FRAMEWORK == "dynamo-trt" ]]; then # Find the logs directory (should be only one for this ISL/OSL combination) LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) @@ -293,23 +117,23 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then echo "Results subdirectory not found: $RESULTS_SUBDIR" fi done - else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data cat > collect_latest_results.py <<'PY' import os, sys -isl, osl, nexp = [int(x) for x in sys.argv[1:]] -for path in sorted([f"logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir("logs/") if os.path.isdir(f"logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +for path in sorted([f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if os.path.isdir(f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY - LOGS_DIR=$(python3 collect_latest_results.py $ISL $OSL $NUMBER_OF_EXPERIMENTS) + + LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" $ISL $OSL 1) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 fi echo "Found logs 
directory: $LOGS_DIR" - ls $LOGS_DIR + ls -la $LOGS_DIR # Result JSON are contained within the result directory for result_file in $(find $LOGS_DIR -type f); do @@ -324,4 +148,4 @@ PY done fi -echo "All result files processed" +echo "All result files processed" \ No newline at end of file diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py deleted file mode 100644 index f0d9a4390..000000000 --- a/utils/matrix-logic/generate_sweep_configs.py +++ /dev/null @@ -1,968 +0,0 @@ -import json -import yaml -import argparse -from pydantic import BaseModel, Field, ValidationError, ConfigDict -from typing import List - -# Field name constants -# Top-level config fields -FIELD_IMAGE = 'image' -FIELD_MODEL = 'model' -FIELD_MODEL_PREFIX = 'model-prefix' -FIELD_PRECISION = 'precision' -FIELD_FRAMEWORK = 'framework' -FIELD_RUNNER = 'runner' -FIELD_SEQ_LEN_CONFIGS = 'seq-len-configs' - -# Seq-len-config fields -FIELD_ISL = 'isl' -FIELD_OSL = 'osl' -FIELD_SEARCH_SPACE = 'search-space' - -# Search-space/benchmark fields -FIELD_TP = 'tp' -FIELD_CONC_START = 'conc-start' -FIELD_CONC_END = 'conc-end' -FIELD_EP = 'ep' -FIELD_DP_ATTN = 'dp-attn' - -# Matrix entry fields -FIELD_CONC = 'conc' -FIELD_MAX_MODEL_LEN = 'max-model-len' -FIELD_EXP_NAME = 'exp-name' - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -# Reverse mapping for exp-name generation -seq_len_itos = {v: k for k, v in seq_len_stoi.items()} - - -def seq_len_to_str(isl: int, osl: int) -> str: - """Convert sequence lengths to short string representation. - - Returns the short name (e.g., '1k1k') if it exists in the mapping, - otherwise returns 'isl_osl' format. 
- """ - return seq_len_itos.get((isl, osl), f"{isl}_{osl}") - - -class MatrixEntry(BaseModel): - """Pydantic model for validating matrix entry structure.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - precision: str - framework: str - runner: str - isl: int - osl: int - tp: int - ep: int - dp_attn: bool = Field(alias='dp-attn') - conc: int - max_model_len: int = Field(alias='max-model-len') - exp_name: str = Field(alias='exp-name') - - -def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: - """Validate that matrix_values entries match the expected structure. - - Raises ValueError if any entry fails validation. - Returns the original list if all entries are valid. - """ - for i, entry in enumerate(matrix_values): - try: - MatrixEntry(**entry) - except ValidationError as e: - raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}") - return matrix_values - - -def validate_master_configs_structure(all_config_data): - """Validate the structure of all master config entries. - - This validates that all required fields are present, have correct types, - and no extra fields exist. Should be called once after loading config files. 
- """ - for key, val in all_config_data.items(): - # Check for required top-level fields and their types - required_fields = { - FIELD_IMAGE: str, - FIELD_MODEL: str, - FIELD_MODEL_PREFIX: str, - FIELD_PRECISION: str, - FIELD_FRAMEWORK: str, - FIELD_RUNNER: str, - FIELD_SEQ_LEN_CONFIGS: list - } - - for field, expected_type in required_fields.items(): - if field not in val or val[field] is None: - raise ValueError( - f"Missing required field '{field}' for key '{key}'") - if not isinstance(val[field], expected_type): - raise ValueError( - f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") - - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - if len(seq_len_configs) == 0: - raise ValueError( - f"'{FIELD_SEQ_LEN_CONFIGS}' must be a non-empty list for key '{key}'") - - # Validate each seq-len-config - for i, seq_config in enumerate(seq_len_configs): - # Check isl - if FIELD_ISL not in seq_config or seq_config[FIELD_ISL] is None: - raise ValueError( - f"Missing '{FIELD_ISL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_ISL], int): - raise ValueError( - f"'{FIELD_ISL}' must be int in seq-len-config[{i}] for key '{key}'") - - # Check osl - if FIELD_OSL not in seq_config or seq_config[FIELD_OSL] is None: - raise ValueError( - f"Missing '{FIELD_OSL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_OSL], int): - raise ValueError( - f"'{FIELD_OSL}' must be int in seq-len-config[{i}] for key '{key}'") - - bmk_space = seq_config.get(FIELD_SEARCH_SPACE) - if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: - raise ValueError( - f"Missing or invalid '{FIELD_SEARCH_SPACE}' in seq-len-config[{i}] for key '{key}'") - - # Validate each benchmark in search-space - for j, bmk in enumerate(bmk_space): - # Define allowed fields - allowed_fields = {FIELD_TP, FIELD_CONC_START, - FIELD_CONC_END, FIELD_EP, FIELD_DP_ATTN} - required_bmk_fields = {FIELD_TP: 
int, - FIELD_CONC_START: int, FIELD_CONC_END: int} - optional_bmk_fields = {FIELD_EP: int, FIELD_DP_ATTN: bool} - - # Check for extra fields - extra_fields = set(bmk.keys()) - allowed_fields - if extra_fields: - raise ValueError( - f"Extra fields {extra_fields} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate required fields - for field, expected_type in required_bmk_fields.items(): - if field not in bmk or bmk[field] is None: - raise ValueError( - f"Missing '{field}' in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate optional fields if they exist - for field, expected_type in optional_bmk_fields.items(): - if field in bmk and bmk[field] is not None: - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - -def generate_full_sweep(args, all_config_data): - """Generate full sweep configurations with optional filtering. - - Supports filtering by model prefix, precision, framework, runner type, and sequence lengths. - Supports test mode to only run highest TP with lowest concurrency. - - All filters are optional - can generate sweeps for all configs or filter by specific criteria. - - Assumes all_config_data has been validated by validate_config_structure(). 
- """ - # Validate runner types if specified - if args.runner_type: - if not args.runner_config: - raise ValueError( - "--runner-config is required when --runner-type is specified") - - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - valid_runner_types = set(runner_config.keys()) - invalid_runners = set(args.runner_type) - valid_runner_types - if invalid_runners: - raise ValueError( - f"Invalid runner type(s): {invalid_runners}. " - f"Valid runner types are: {', '.join(sorted(valid_runner_types))}") - - matrix_values = [] - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - for key, val in all_config_data.items(): - # Filter by model prefix if specified - if args.model_prefix: - if not any(key.startswith(prefix) for prefix in args.model_prefix): - continue - - # Filter by precision if specified - if args.precision and val[FIELD_PRECISION] not in args.precision: - continue - - # Filter by framework if specified - if args.framework and val[FIELD_FRAMEWORK] not in args.framework: - continue - - # Filter by runner type if specified - if args.runner_type and val[FIELD_RUNNER] not in args.runner_type: - continue - - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - image = val[FIELD_IMAGE] - model = val[FIELD_MODEL] - precision = val[FIELD_PRECISION] - framework = val[FIELD_FRAMEWORK] - runner = val[FIELD_RUNNER] - model_code = val[FIELD_MODEL_PREFIX] - - for seq_config in seq_len_configs: - isl = seq_config[FIELD_ISL] - osl = seq_config[FIELD_OSL] - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config[FIELD_SEARCH_SPACE] - - if args.test_mode: - # In test mode, use highest TP with lowest concurrency - 
highest_tp_bmk = max(bmk_space, key=lambda x: x[FIELD_TP]) - tp = highest_tp_bmk[FIELD_TP] - conc = highest_tp_bmk[FIELD_CONC_START] - ep = highest_tp_bmk.get(FIELD_EP) - dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) - - seq_len_str = seq_len_to_str(isl, osl) - entry = { - FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, - FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_EP: 1, # Default - FIELD_DP_ATTN: False, # Default - FIELD_CONC: conc, - FIELD_MAX_MODEL_LEN: isl + osl + 200, - FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", - } - - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - else: - # Full sweep mode - for bmk in bmk_space: - tp = bmk[FIELD_TP] - conc_start = bmk[FIELD_CONC_START] - conc_end = bmk[FIELD_CONC_END] - ep = bmk.get(FIELD_EP) - dp_attn = bmk.get(FIELD_DP_ATTN) - - conc = conc_start - while conc <= conc_end: - seq_len_str = seq_len_to_str(isl, osl) - entry = { - FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, - FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_CONC: conc, - FIELD_MAX_MODEL_LEN: isl + osl + 200, - FIELD_EP: 1, # Default - FIELD_DP_ATTN: False, # Default - FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", - } - - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - if len(matrix_values) == 0: - error_msg = "No configs found matching filters:" - if args.model_prefix: - error_msg += f" model-prefix={args.model_prefix}" - if args.precision: - error_msg += f" precision={args.precision}" - if args.framework: - error_msg += f" framework={args.framework}" - if args.runner_type: - error_msg += f" 
runner-type={args.runner_type}" - if seq_lens_filter: - error_msg += f" seq-lens={args.seq_lens}" - raise ValueError(error_msg) - - return matrix_values - - -def generate_test_config(args, all_config_data): - """Generate test configurations for a specific key. - - Assumes all_config_data has been validated by validate_config_structure(). - """ - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - val = all_config_data.get(args.key) - - if not val: - raise ValueError( - f"Specified key '{args.key}' does not exist in config files.") - - # Extract model code from config - model_code = val[FIELD_MODEL_PREFIX] - - runner_nodes = runner_config.get(val[FIELD_RUNNER], []) - if args.runner_node and args.runner_node not in runner_nodes: - raise ValueError( - f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val[FIELD_RUNNER]}'. 
Available runner nodes for this config are '{', '.join(runner_nodes)}'.") - - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - image = val[FIELD_IMAGE] - model = val[FIELD_MODEL] - precision = val[FIELD_PRECISION] - framework = val[FIELD_FRAMEWORK] - # Use default runner or specific runner node if input by user - runner = val[FIELD_RUNNER] if not args.runner_node else args.runner_node - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config[FIELD_ISL] - osl = seq_config[FIELD_OSL] - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config[FIELD_SEARCH_SPACE] - - for bmk in bmk_space: - tp = bmk[FIELD_TP] - conc_start = bmk[FIELD_CONC_START] - conc_end = bmk[FIELD_CONC_END] - ep = bmk.get(FIELD_EP) - dp_attn = bmk.get(FIELD_DP_ATTN) - - # In test mode, only use the lowest concurrency (conc_start) - if args.test_mode: - entry = { - FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, - FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: conc_start, - FIELD_MAX_MODEL_LEN: isl + osl, - FIELD_EXP_NAME: f"{model_code}_test", - } - - # Add optional fields if they exist - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - else: - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - seq_len_str = seq_len_to_str(isl, osl) - entry = { - FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, 
- FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: conc, - FIELD_MAX_MODEL_LEN: isl + osl, - FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", - } - - # Add optional fields if they exist - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - return matrix_values - - -def generate_runner_model_sweep_config(args, all_config_data): - """Generate runner-model sweep configurations. - - Assumes all_config_data has been validated by validate_config_structure(). - """ - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - runner_nodes = runner_config.get(args.runner_type) - - if not runner_nodes: - raise ValueError( - f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") - - # Filter runner nodes if filter is specified - if args.runner_node_filter: - runner_nodes = [node for node in runner_nodes if args.runner_node_filter in node] - if not runner_nodes: - raise ValueError( - f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.") - - matrix_values = [] - for key, val in all_config_data.items(): - # Only consider configs with specified runner - if val[FIELD_RUNNER] != args.runner_type: - continue - - # Get model code for exp_name - model_code = val[FIELD_MODEL_PREFIX] - - # Find 1k1k config - target_config = None - for config in val[FIELD_SEQ_LEN_CONFIGS]: - if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: - target_config = config - break - - highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) - # Since we are just testing, pick the highest TP for this config and just test - # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk[FIELD_TP] - lowest_conc = highest_tp_bmk[FIELD_CONC_START] - - ep = highest_tp_bmk.get(FIELD_EP) - dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) - - for node in runner_nodes: - entry = { - FIELD_IMAGE: val[FIELD_IMAGE], - FIELD_MODEL: val[FIELD_MODEL], - FIELD_PRECISION: val[FIELD_PRECISION], - FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], - # Add one entry for each node under specified runner type - FIELD_RUNNER: node, - # Again, just use 1k1k since this is just meant to smoke test all runners - FIELD_ISL: 1024, - FIELD_OSL: 1024, - FIELD_TP: highest_tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: lowest_conc, - FIELD_MAX_MODEL_LEN: 2048, - FIELD_EXP_NAME: f"{model_code}_test", - } - - # Add optional fields if they exist - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - - return matrix_values - - -def 
generate_custom_test(args): - """Generate single 1k1k job for custom inputs. - """ - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - found_runner_label = False - for runner_type, runner_nodes in runner_config.items(): - if args.runner_label == runner_type or args.runner_label in runner_nodes: - found_runner_label = True - - if not found_runner_label: - raise ValueError(f"Unable to find specified runner label '{args.runner_label}'.") - - if not runner_nodes: - raise ValueError( - f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") - - return [ - { - FIELD_IMAGE: args.image, - FIELD_MODEL: args.model, - FIELD_PRECISION: args.precision, - FIELD_FRAMEWORK: args.framework, - FIELD_RUNNER: args.runner_label, - # Again, just use 1k1k since this is just meant to smoke test all runners - FIELD_ISL: 1024, - FIELD_OSL: 1024, - FIELD_TP: 8, - FIELD_EP: 1, - FIELD_DP_ATTN: False, - FIELD_CONC: 4, - FIELD_EXP_NAME: args.exp_name, - FIELD_MAX_MODEL_LEN: 2048, - } - ] - - -def generate_runner_sweep_config(args, all_config_data): - """Generate runner sweep configurations. - - Assumes all_config_data has been validated by validate_config_structure(). - """ - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - if not runner_config.get(args.runner_type): - raise ValueError( - f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") - - - matrix_values = [] - for key, val in all_config_data.items(): - # Only consider configs with specified runner - if not key.startswith(args.model_prefix): - continue - - if not val[FIELD_RUNNER] == args.runner_type: - continue - - # Optionally filter by precision and framework - if (args.precision and val[FIELD_PRECISION] != args.precision) or (args.framework and val[FIELD_FRAMEWORK] != args.framework): - continue - - # Get model code for exp_name - model_code = val[FIELD_MODEL_PREFIX] - - runner_nodes = runner_config.get(val[FIELD_RUNNER]) - if not runner_nodes: - raise ValueError( - f"Runner '{val[FIELD_RUNNER]}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") - - # Find 1k1k config - target_config = None - for config in val[FIELD_SEQ_LEN_CONFIGS]: - if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: - target_config = config - break - - highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) - # Since we are just testing, pick the highest TP for this config and just test - # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk[FIELD_TP] - lowest_conc = highest_tp_bmk[FIELD_CONC_START] - - ep = highest_tp_bmk.get(FIELD_EP) - dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) - - for node in runner_nodes: - entry = { - FIELD_IMAGE: val[FIELD_IMAGE], - FIELD_MODEL: val[FIELD_MODEL], - FIELD_PRECISION: val[FIELD_PRECISION], - FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], - # Add one entry for each node under specified runner type - FIELD_RUNNER: node, - # Again, just use 1k1k since this is just meant to smoke test all runners - FIELD_ISL: 1024, - FIELD_OSL: 1024, - FIELD_TP: highest_tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: lowest_conc, - FIELD_EXP_NAME: f"{model_code}_test", - FIELD_MAX_MODEL_LEN: 2048, - } - - # 
Add optional fields if they exist - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - - if len(matrix_values) == 0: - error_msg = f"No configs found matching model prefix '{args.model_prefix}'" - if args.precision: - error_msg += f", precision '{args.precision}'" - if args.framework: - error_msg += f", framework '{args.framework}'" - raise ValueError(error_msg + ".") - - return matrix_values - - -def load_config_files(config_files): - """Load and merge configuration files.""" - all_config_data = {} - for config_file in config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance( - config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys, this is only in place to prevent against the very unlikely - # case where an entry in one config accidentally/purposefully tries to override an entry in another config - duplicate_keys = set(all_config_data.keys()) & set( - config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - return all_config_data - - -def main(): - # Create parent parser with common arguments - parent_parser = argparse.ArgumentParser(add_help=False) - parent_parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - - # Create main parser - parser = argparse.ArgumentParser( - description='Generate benchmark configurations from YAML config files' - ) - - # Create subparsers for subcommands - subparsers = parser.add_subparsers( - dest='command', - required=True, - help='Available commands' - ) - - # Subcommand: full-sweep - full_sweep_parser = subparsers.add_parser( - 
'full-sweep', - parents=[parent_parser], - add_help=False, - help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' - ) - full_sweep_parser.add_argument( - '--model-prefix', - nargs='+', - required=False, - help='Model prefix(es) to filter configurations (optional, can specify multiple)' - ) - full_sweep_parser.add_argument( - '--precision', - nargs='+', - required=False, - help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' - ) - full_sweep_parser.add_argument( - '--framework', - nargs='+', - required=False, - help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' - ) - full_sweep_parser.add_argument( - '--runner-type', - nargs='+', - required=False, - help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' - ) - full_sweep_parser.add_argument( - '--runner-config', - required=False, - help='Configuration file holding runner information (required if --runner-type is specified)' - ) - full_sweep_parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - full_sweep_parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - full_sweep_parser.add_argument( - '--test-mode', - action='store_true', - help='Test mode: only run highest TP with lowest concurrency for each matching config' - ) - full_sweep_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: test-config - test_config_parser = subparsers.add_parser( - 'test-config', - parents=[parent_parser], - add_help=False, - help='Given a config key, run that configuration as specified. 
Optionally specify --test-mode to only run one parallelism-concurrency pair for the config.' - ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) - test_config_parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - test_config_parser.add_argument( - '--runner-node', - required=False, - help='Specific runner node to use' - ) - test_config_parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - test_config_parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - test_config_parser.add_argument( - '--test-mode', - action='store_true', - help='Generate only the lowest concurrency value for each TP level' - ) - test_config_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: runner-model-sweep - test_config_parser = subparsers.add_parser( - 'runner-model-sweep', - parents=[parent_parser], - add_help=False, - help='Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner nodes.' 
- ) - test_config_parser.add_argument( - '--runner-type', - required=True, - help='Runner type (e.g., b200-trt, h100)' - ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) - test_config_parser.add_argument( - '--runner-node-filter', - required=False, - help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)' - ) - test_config_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: runner-sweep - test_config_parser = subparsers.add_parser( - 'runner-sweep', - parents=[parent_parser], - add_help=False, - help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runners nodes that should run gptoss-120b actually do so successfully.' 
- ) - test_config_parser.add_argument( - '--runner-type', - required=True, - help='Runner type (e.g., b200-trt, h100)' - ) - test_config_parser.add_argument( - '--model-prefix', - required=True, - help='Model prefix (e.g., 70b)' - ) - test_config_parser.add_argument( - '--precision', - required=False, - help='Precision to filter by (e.g., fp4) (optional)' - ) - test_config_parser.add_argument( - '--framework', - required=False, - help='Framework to filter by (e.g., trt) (optional)' - ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) - test_config_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: custom - test_config_parser = subparsers.add_parser( - 'custom', - parents=[parent_parser], - add_help=False, - help='Enter custom values' - ) - test_config_parser.add_argument( - '--runner-label', - required=True, - help='Label associated with runner on which to launch the corresponding job (e.g., h200, h200-nv_1, etc.)' - ) - test_config_parser.add_argument( - '--image', - required=True, - help='Image to run the benchmark (e.g., openai/gpt-oss-120b)' - ) - test_config_parser.add_argument( - '--model', - required=True, - help='Model to run (e.g., vllm/vllm-openai:latest)' - ) - test_config_parser.add_argument( - '--framework', - required=True, - help='Framework to run on (e.g., vllm, trt, sglang)' - ) - test_config_parser.add_argument( - '--precision', - required=True, - help='Precision to run (e.g., fp4, fp8)' - ) - test_config_parser.add_argument( - '--exp-name', - required=True, - help='Experiment name (e.g., 70b_test)' - ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) - test_config_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - args = parser.parse_args() - - # Load and validate 
configuration files - all_config_data = load_config_files(args.config_files) - validate_master_configs_structure(all_config_data) - - # Route to appropriate function based on subcommand - if args.command == 'full-sweep': - matrix_values = generate_full_sweep(args, all_config_data) - elif args.command == 'test-config': - matrix_values = generate_test_config(args, all_config_data) - elif args.command == 'runner-model-sweep': - matrix_values = generate_runner_model_sweep_config( - args, all_config_data) - elif args.command == 'runner-sweep': - matrix_values = generate_runner_sweep_config( - args, all_config_data) - elif args.command == 'custom': - matrix_values = generate_custom_test(args) - else: - parser.error(f"Unknown command: {args.command}") - - # Validate output before printing - validate_matrix_output(matrix_values) - - print(json.dumps(matrix_values)) - return matrix_values - - -if __name__ == "__main__": - main() diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py deleted file mode 100644 index 4df4a51eb..000000000 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ /dev/null @@ -1,151 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from a specific configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
- ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. 
" - f"Available keys: {available_keys}" - ) - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - 
return matrix_values - -if __name__ == "__main__": - main() diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py deleted file mode 100644 index c184ecbab..000000000 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ /dev/null @@ -1,1656 +0,0 @@ -import pytest -import yaml -from unittest.mock import patch -from generate_sweep_configs import ( - validate_master_configs_structure, - validate_matrix_output, - seq_len_to_str, - generate_full_sweep, - generate_test_config, - generate_runner_model_sweep_config, - generate_runner_sweep_config, - generate_custom_test, - load_config_files, - main, - MatrixEntry, -) - - -# Fixtures for test config files -@pytest.fixture -def sample_master_config(): - """Sample master config with valid entries.""" - return { - "70b-fp8-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "meta-llama/Llama-3-70b", - "model-prefix": "70b", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 4}, - {"tp": 8, "conc-start": 2, "conc-end": 8, "ep": 2, "dp-attn": True} - ] - }, - { - "isl": 1024, - "osl": 8192, - "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 2} - ] - } - ] - }, - "8b-fp4-trt": { - "image": "nvcr.io/nvidia/tritonserver:24.01", - "model": "meta-llama/Llama-3-8b", - "model-prefix": "8b", - "precision": "fp4", - "framework": "trt", - "runner": "h100", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 2, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-120b-fp8-vllm": { - "image": "vllm/vllm-openai:latest", - "model": "openai/gpt-oss-120b", - "model-prefix": "gptoss", - "precision": "fp8", - "framework": "vllm", - "runner": "h200-trt", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 4} - ] - } - ] - } - } - 
- -@pytest.fixture -def sample_runner_config(): - """Sample runner config.""" - return { - "h200": ["h200-nv_1", "h200-nv_2"], - "h100": ["h100-aws_1"], - "h200-trt": ["h200-trt_1", "h200-trt_2", "h200-trt_3"] - } - - -@pytest.fixture -def temp_config_files(tmp_path, sample_master_config, sample_runner_config): - """Create temporary config files.""" - master_file = tmp_path / "master.yaml" - runner_file = tmp_path / "runners.yaml" - - with open(master_file, 'w') as f: - yaml.dump(sample_master_config, f) - - with open(runner_file, 'w') as f: - yaml.dump(sample_runner_config, f) - - return str(master_file), str(runner_file) - - -@pytest.fixture -def invalid_master_config(): - """Master config with validation errors.""" - return { - "missing-field": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - # Missing precision, framework, runner, seq-len-configs - } - } - - -# Tests for seq_len_to_str -def test_seq_len_to_str_with_mapping(): - """Test seq_len_to_str with known mappings.""" - assert seq_len_to_str(1024, 1024) == "1k1k" - assert seq_len_to_str(1024, 8192) == "1k8k" - assert seq_len_to_str(8192, 1024) == "8k1k" - - -def test_seq_len_to_str_without_mapping(): - """Test seq_len_to_str fallback for unknown mappings.""" - assert seq_len_to_str(2048, 4096) == "2048_4096" - assert seq_len_to_str(512, 512) == "512_512" - - -# Tests for MatrixEntry validation -def test_matrix_entry_valid(): - """Test valid MatrixEntry.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - result = MatrixEntry(**entry) - assert result.image == "test:latest" - assert result.tp == 8 - - -def test_matrix_entry_missing_field(): - """Test MatrixEntry with missing required field.""" - entry = { - "image": "test:latest", - "model": "test/model", - # 
Missing other required fields - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_wrong_type(): - """Test MatrixEntry with wrong type.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": "not_an_int", # Wrong type - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_extra_field(): - """Test MatrixEntry with extra field (should be forbidden).""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp", - "extra-field": "should_fail" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -# Tests for validate_matrix_output -def test_validate_matrix_output_valid(): - """Test validate_matrix_output with valid entries.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - ] - result = validate_matrix_output(entries) - assert result == entries - - -def test_validate_matrix_output_invalid(): - """Test validate_matrix_output with invalid entry.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - # Missing required fields - } - ] - with pytest.raises(ValueError, match="Matrix entry at index 0 failed validation"): - validate_matrix_output(entries) - - -def test_validate_matrix_output_multiple_entries(): - """Test validate_matrix_output with multiple entries.""" - entries 
= [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - }, - { - "image": "test2:latest", - "model": "test2/model", - "precision": "fp4", - "framework": "trt", - "runner": "h100", - "isl": 1024, - "osl": 1024, - "tp": 4, - "ep": 2, - "dp-attn": True, - "conc": 8, - "max-model-len": 2048, - "exp-name": "test_exp2" - } - ] - result = validate_matrix_output(entries) - assert len(result) == 2 - - -# Tests for validate_master_configs_structure -def test_validate_master_configs_structure_valid(sample_master_config): - """Test validation of valid master config.""" - validate_master_configs_structure(sample_master_config) - - -def test_validate_master_configs_structure_missing_field(): - """Test validation with missing required field.""" - config = { - "test-key": { - "image": "test:latest", - "model-prefix": "test", - # Missing other required fields - } - } - with pytest.raises(ValueError, match="Missing required field"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_type(): - """Test validation with wrong field type.""" - config = { - "test-key": { - "image": 123, # Should be string - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be str"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_empty_seq_len_configs(): - """Test validation with empty seq-len-configs.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be a non-empty list"): - 
validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_invalid_search_space(): - """Test validation with invalid search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 8} # Missing conc-start and conc-end - ] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'conc-start'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_search_space(): - """Test validation with missing search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024 - # Missing search-space - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_search_space_not_list(): - """Test validation with search-space not being a list.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": "not_a_list" - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_extra_fields_in_search_space(): - """Test validation with extra fields in search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - 
"search-space": [ - { - "tp": 8, - "conc-start": 1, - "conc-end": 4, - "invalid-field": "value" - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="Extra fields"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_isl(): - """Test validation with missing isl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'isl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_isl_type(): - """Test validation with wrong isl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": "not_int", - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'isl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_osl(): - """Test validation with missing osl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'osl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_osl_type(): - """Test validation with wrong osl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": 
"vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": "not_int", - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'osl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_tp_type(): - """Test validation with wrong tp type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": "not_int", "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'tp' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_start_type(): - """Test validation with wrong conc-start type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": "not_int", "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-start' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_end_type(): - """Test validation with wrong conc-end type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-end' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_ep_type(): - """Test validation with wrong ep 
type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "ep": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'ep' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_dp_attn_type(): - """Test validation with wrong dp-attn type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "dp-attn": "not_bool"}] - } - ] - } - } - with pytest.raises(ValueError, match="'dp-attn' must be bool"): - validate_master_configs_structure(config) - - -# Tests for load_config_files -def test_load_config_files_valid(temp_config_files): - """Test loading valid config files.""" - master_file, _ = temp_config_files - result = load_config_files([master_file]) - assert len(result) == 3 - assert "70b-fp8-vllm" in result - - -def test_load_config_files_multiple(tmp_path, sample_master_config): - """Test loading multiple config files.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"8b-fp4-trt": sample_master_config["8b-fp4-trt"]} - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - result = load_config_files([str(file1), str(file2)]) - assert len(result) == 2 - - -def test_load_config_files_not_found(): - """Test loading non-existent config file.""" - with pytest.raises(ValueError, match="does not exist"): - load_config_files(["/nonexistent/file.yaml"]) - - -def 
test_load_config_files_duplicate_keys(tmp_path, sample_master_config): - """Test loading files with duplicate keys.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} # Duplicate - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - with pytest.raises(ValueError, match="Duplicate configuration keys"): - load_config_files([str(file1), str(file2)]) - - -# Tests for generate_full_sweep -def test_generate_full_sweep_basic(sample_master_config, temp_config_files): - """Test basic full sweep generation.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['exp-name'].startswith('70b_1k1k') for entry in result) - assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) - - -def test_generate_full_sweep_with_optionals(sample_master_config, temp_config_files): - """Test full sweep with optional ep and dp-attn.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # Find entry with tp=8 which should have ep=2 and dp-attn=True - tp8_entries = [e for e in result if e['tp'] == 8] - assert len(tp8_entries) > 0 - assert all(e['ep'] == 2 for e in tp8_entries) - assert all(e['dp-attn'] == True for e in tp8_entries) - - -def test_generate_full_sweep_no_matches(sample_master_config, temp_config_files): - """Test full sweep with no 
matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["nonexistent"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_different_seq_len(sample_master_config, temp_config_files): - """Test full sweep with different sequence length.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k8k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result) - - -def test_generate_full_sweep_step_size(sample_master_config, temp_config_files): - """Test full sweep with different step size.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 4 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # Should have entries at conc=4, 8, 16 (step_size=4, conc-start=4, conc-end=16) - conc_values = sorted(set(e['conc'] for e in result)) - assert 4 in conc_values - assert 16 in conc_values - - -def test_generate_full_sweep_seq_len_not_in_config(temp_config_files): - """Test full sweep when requested seq-len is not in config.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 8192, - "osl": 1024, # Only has 8k1k, not 1k1k - "search-space": [ 
- {"tp": 4, "conc-start": 1, "conc-end": 4} - ] - } - ] - } - } - - class Args: - model_prefix = ["test"] - seq_lens = ["1k1k"] # Requesting 1k1k but config only has 8k1k - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - # Should raise error since no matching seq-len - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), config) - - -def test_generate_full_sweep_concurrency_overshoot(temp_config_files): - """Test full sweep when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 5} # 1, 3*2=6 overshoots, clamps to 5 - ] - } - ] - } - } - - class Args: - model_prefix = ["test"] - seq_lens = ["1k1k"] - step_size = 3 # Will overshoot: 1, 3, 9 (clamped to 5) - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 1, 3, 5 (5 is the clamped value) - assert conc_values == [1, 3, 5] - - -# Tests for generate_full_sweep with filters -def test_generate_full_sweep_no_filters(sample_master_config, temp_config_files): - """Test filtered sweep with no filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_full_sweep_with_filters_model_prefix(sample_master_config, temp_config_files): 
- """Test filtered sweep with model prefix filter.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert all("70b" in entry['exp-name'] for entry in result) - - -def test_generate_full_sweep_with_filters_multiple_filters(sample_master_config, temp_config_files): - """Test filtered sweep with multiple filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = ["fp8"] - framework = ["vllm"] - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) - - -def test_generate_full_sweep_with_filters_test_mode(sample_master_config, temp_config_files): - """Test filtered sweep in test mode.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # In test mode, should only get one entry per seq-len (highest TP, lowest conc) - assert len(result) == 1 # Only one config matches 70b with 1k1k - assert result[0]['tp'] == 8 # Highest TP - assert '70b_1k1k' in result[0]['exp-name'] - - -def test_generate_full_sweep_with_filters_runner_type_validation(sample_master_config, temp_config_files): - """Test filtered sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["invalid-runner"] - seq_lens = None - 
step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="Invalid runner type"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_runner_type_no_config(sample_master_config): - """Test filtered sweep with runner type but no config file.""" - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200"] - seq_lens = None - step_size = 2 - test_mode = False - runner_config = None - - with pytest.raises(ValueError, match="runner-config is required"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_multiple_runner_types(sample_master_config, temp_config_files): - """Test filtered sweep with multiple runner types.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200", "h100"] - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200' in runners or 'h100' in runners - - -def test_generate_full_sweep_with_filters_no_matches(sample_master_config, temp_config_files): - """Test filtered sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["nonexistent"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_files): - """Test filtered sweep when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": 
"test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 2, "conc-end": 7} # 2, 8 overshoots, clamps to 7 - ] - } - ] - } - } - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 4 # Will overshoot: 2, 8 (clamped to 7) - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 2, 7 (7 is the clamped value) - assert 2 in conc_values - assert 7 in conc_values - - -# Tests for generate_test_config -def test_generate_test_config_basic(sample_master_config, temp_config_files): - """Test basic test config generation.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = generate_test_config(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_test_config_test_mode(sample_master_config, temp_config_files): - """Test test config in test mode.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - - result = generate_test_config(Args(), sample_master_config) - # In test mode, should only use lowest concurrency - assert all(entry['conc'] == 1 or entry['conc'] == 2 for entry in result) - - -def test_generate_test_config_specific_runner_node(sample_master_config, temp_config_files): - """Test test config with specific runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = 
generate_test_config(Args(), sample_master_config) - assert all(entry['runner'] == 'h200-nv_1' for entry in result) - - -def test_generate_test_config_invalid_key(sample_master_config, temp_config_files): - """Test test config with invalid key.""" - _, runner_file = temp_config_files - - class Args: - key = "nonexistent-key" - runner_config = runner_file - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist in config files"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_invalid_runner_node(sample_master_config, temp_config_files): - """Test test config with invalid runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "invalid-node" - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="is not compatible"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_missing_runner_config(sample_master_config): - """Test test config with missing runner config file.""" - class Args: - key = "70b-fp8-vllm" - runner_config = "/nonexistent/file.yaml" - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_concurrency_overshoot(temp_config_files): - """Test test config when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 6} - ] - } - ] - } - } - - class Args: - key = "test-fp8-vllm" - runner_config = runner_file - 
runner_node = "h200-nv_1" - seq_lens = None - step_size = 4 # Will overshoot: 1, 4, 16 (clamped to 6) - test_mode = False - - result = generate_test_config(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - assert 1 in conc_values - assert 4 in conc_values - assert 6 in conc_values - - -# Tests for generate_runner_model_sweep_config -def test_generate_runner_model_sweep_config(sample_master_config, temp_config_files): - """Test runner-model sweep config generation.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - assert len(result) > 0 - # Should have entries for each runner node under h200 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -def test_generate_runner_model_sweep_config_invalid_runner(sample_master_config, temp_config_files): - """Test runner-model sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "invalid-runner" - runner_config = runner_file - runner_node_filter = None - - with pytest.raises(ValueError, match="does not exist in runner config"): - generate_runner_model_sweep_config(Args(), sample_master_config) - - -def test_generate_runner_model_sweep_config_with_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv_1" - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should only have entries for h200-nv_1 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' not in runners - - -def 
test_generate_runner_model_sweep_config_with_node_filter_multiple_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter matching multiple nodes.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv" # Should match both nv_1 and nv_2 - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -def test_generate_runner_model_sweep_config_with_node_filter_no_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter that matches no nodes.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nonexistent" - - with pytest.raises(ValueError, match="No runner nodes found matching filter"): - generate_runner_model_sweep_config(Args(), sample_master_config) - - -def test_generate_runner_model_sweep_config_without_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep without runner node filter (default behavior).""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None - - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should have entries for all h200 nodes - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -# Tests for generate_runner_sweep_config -def test_generate_runner_sweep_config(sample_master_config, temp_config_files): - """Test runner sweep config generation.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file - - result = 
generate_runner_sweep_config(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_config_files): - """Test runner sweep with precision and framework filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = "fp8" - framework = "vllm" - runner_config = runner_file - - result = generate_runner_sweep_config(Args(), sample_master_config) - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) - - -def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_config_files): - """Test runner sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = "nonexistent" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching"): - generate_runner_sweep_config(Args(), sample_master_config) - - -# Tests for generate_custom_test -def test_generate_custom_test(temp_config_files): - """Test custom test generation.""" - _, runner_file = temp_config_files - - class Args: - runner_label = "h200" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file - - result = generate_custom_test(Args()) - assert len(result) == 1 - assert result[0]['image'] == "vllm/vllm-openai:latest" - assert result[0]['exp-name'] == "custom_test" - - -def test_generate_custom_test_invalid_runner(temp_config_files): - """Test custom test with invalid runner label.""" - _, runner_file = temp_config_files - - class Args: - runner_label = "invalid-runner" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file - - with 
pytest.raises(ValueError, match="Unable to find specified runner label"): - generate_custom_test(Args()) - - -# Tests for main function -def test_main_full_sweep(temp_config_files): - """Test main function with full-sweep command.""" - master_file, _ = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b", - "--step-size", "2" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_full_sweep_with_filters(temp_config_files): - """Test main function with full-sweep command with filters.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--model-prefix", "70b", - "--precision", "fp8", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_test_config(temp_config_files): - """Test main function with test-config command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "test-config", - "--config-files", master_file, - "--runner-config", runner_file, - "--key", "70b-fp8-vllm", - "--runner-node", "h200-nv_1", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_runner_model_sweep(temp_config_files): - """Test main function with runner-model-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_runner_model_sweep_with_node_filter(temp_config_files): - """Test main function with runner-model-sweep command with node 
filter.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--runner-node-filter", "nv_1" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' not in runners - - -def test_main_runner_sweep(temp_config_files): - """Test main function with runner-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_custom(temp_config_files): - """Test main function with custom command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "custom", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-label", "h200", - "--image", "test:latest", - "--model", "test/model", - "--framework", "vllm", - "--precision", "fp8", - "--exp-name", "custom_test" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) == 1 - - -def test_main_invalid_config_structure(tmp_path): - """Test main with invalid config structure.""" - invalid_file = tmp_path / "invalid.yaml" - with open(invalid_file, 'w') as f: - yaml.dump({"key": {"image": "test"}}, f) # Missing required fields - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", str(invalid_file), - "--seq-lens", "1k1k", - "--model-prefix", "test" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError): - main() - - -def test_main_validation_failure(temp_config_files, monkeypatch): - """Test main with 
validation failure on output.""" - master_file, _ = temp_config_files - - # Monkey patch validate_matrix_output to always fail - def mock_validate(entries): - raise ValueError("Validation failed") - - monkeypatch.setattr('generate_sweep_configs.validate_matrix_output', mock_validate) - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError, match="Validation failed"): - main() - - -# Edge case tests -def test_concurrency_step_reaches_exact_end(sample_master_config, temp_config_files): - """Test that concurrency stepping reaches exact end value.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # conc-start=4, conc-end=16, step=2 should give 4,8,16 - conc_values = sorted(set(e['conc'] for e in result)) - assert 16 in conc_values - - -def test_multiple_model_prefixes_filtered_sweep(sample_master_config, temp_config_files): - """Test filtered sweep with multiple model prefixes.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b", "8b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - exp_names = [e['exp-name'] for e in result] - assert any('70b' in name for name in exp_names) - assert any('8b' in name for name in exp_names) - - -def test_seq_len_filter_multiple(sample_master_config, temp_config_files): - """Test filtering with multiple sequence lengths.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - 
runner_type = None - seq_lens = ["1k1k", "1k8k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - seq_lens = set((e['isl'], e['osl']) for e in result) - assert (1024, 1024) in seq_lens - assert (1024, 8192) in seq_lens - - -def test_default_ep_dp_attn_values(sample_master_config, temp_config_files): - """Test that default ep and dp-attn values are set correctly.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # 8b config doesn't specify ep/dp-attn, so should use defaults - assert all(e['ep'] == 1 for e in result) - assert all(e['dp-attn'] == False for e in result) - - -def test_max_model_len_calculation(sample_master_config, temp_config_files): - """Test that max-model-len is calculated correctly.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k8k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # isl=1024, osl=8192, so max-model-len should be 1024+8192+200=9416 - assert all(e['max-model-len'] == 9416 for e in result) - - -if __name__ == "__main__": - pytest.main([__file__, "-v", "--cov=generate_sweep_configs", "--cov-report=term-missing"]) diff --git a/utils/matrix_logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py new file mode 100644 index 000000000..8fc47651c --- /dev/null +++ b/utils/matrix_logic/generate_sweep_configs.py @@ -0,0 +1,570 @@ +import json +import yaml +import argparse + +from validation import validate_master_config, validate_matrix_entry, validate_runner_config, Fields + +seq_len_stoi = { + "1k1k": (1024, 
1024), + "1k8k": (1024, 8192), + "8k1k": (8192, 1024) +} + +# Reverse mapping for exp-name generation +seq_len_itos = {v: k for k, v in seq_len_stoi.items()} + + +def seq_len_to_str(isl: int, osl: int) -> str: + """Convert sequence lengths to short string representation. + + Returns the short name (e.g., '1k1k') if it exists in the mapping, + otherwise returns 'isl_osl' format. + """ + return seq_len_itos.get((isl, osl), f"{isl}_{osl}") + + +def generate_full_sweep(args, all_config_data, runner_data): + """Generate full sweep configurations with optional filtering. + + Supports filtering by model prefix, precision, framework, runner type, sequence lengths, + and max concurrency. + + All filters are optional - can generate sweeps for all configs or filter by specific criteria. + + Assumes all_config_data has been validated by validate_master_config(). + """ + # Validate runner types if specified + if args.runner_type: + valid_runner_types = set(runner_data.keys()) + invalid_runners = set(args.runner_type) - valid_runner_types + if invalid_runners: + raise ValueError( + f"Invalid runner type(s): {invalid_runners}. 
" + f"Valid runner types are: {', '.join(sorted(valid_runner_types))}") + + matrix_values = [] + + # Convert seq-lens to set of (isl, osl) tuples for filtering + seq_lens_filter = None + if args.seq_lens: + seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + + # Iterate through all configurations and apply filters as specified (this is just "selecting" + # configs from all of the master configs subject to some pattern matching) + for key, val in all_config_data.items(): + # Filter by model prefix if specified + if args.model_prefix: + if not any(key.startswith(prefix) for prefix in args.model_prefix): + continue + + # Filter by precision if specified + if args.precision and val[Fields.PRECISION.value] not in args.precision: + continue + + # Filter by framework if specified + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: + continue + + # Filter by runner type if specified + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: + continue + + # Check if this is a multinode config + is_multinode = val.get(Fields.MULTINODE.value, False) + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) + + seq_len_configs = val[Fields.SEQ_LEN_CONFIGS.value] + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + model_code = val[Fields.MODEL_PREFIX.value] + + for seq_config in seq_len_configs: + isl = seq_config[Fields.ISL.value] + osl = seq_config[Fields.OSL.value] + + # Filter by sequence lengths if specified + if seq_lens_filter and (isl, osl) not in seq_lens_filter: + continue + + bmk_space = seq_config[Fields.SEARCH_SPACE.value] + + for bmk in bmk_space: + if is_multinode: + # Skip multinode configs when --single-node is specified + if not args.multi_node: + continue + + # Multinode configuration + # spec_decoding defaults to "none" if 
not specified + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + + # Get concurrency values (can be list or range) + conc_list = bmk.get(Fields.CONC_LIST.value) + # If it's a list + if conc_list: + conc_values = conc_list + # If it's a range + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + # Apply max-conc filter if specified + # If max_conc is less than all values, use max_conc directly (if valid) + if args.max_conc is not None: + filtered_conc = [c for c in conc_values if c <= args.max_conc] + if not filtered_conc: + # No existing values <= max_conc, so use max_conc directly if valid + if args.max_conc > 0: + conc_values = [args.max_conc] + else: + continue # Skip if max_conc is not positive + else: + conc_values = filtered_conc + + # For multinode, create a single entry with conc as a list + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc_values, # Pass the entire list for multinode + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + + validate_matrix_entry(entry, is_multinode) + matrix_values.append(entry) + elif args.single_node: + # Single-node configuration + tp = bmk[Fields.TP.value] + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + 
ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + + # Apply max-tp filter if specified + # If tp > max_tp, use max_tp instead of skipping (if valid) + if args.max_tp is not None: + if args.max_tp <= 0: + continue # Skip if max_tp is not positive + if tp > args.max_tp: + tp = args.max_tp + + # Apply max-ep filter if specified + # If ep > max_ep, use max_ep instead of skipping (if valid) + if args.max_ep is not None: + if args.max_ep <= 0: + continue # Skip if max_ep is not positive + if ep is not None and ep > args.max_ep: + ep = args.max_ep + + # Apply max-conc filter if specified + # If conc_start > max_conc, use max_conc as both start and end (if valid) + if args.max_conc is not None: + if args.max_conc <= 0: + continue # Skip if max_conc is not positive + if conc_start > args.max_conc: + conc_start = args.max_conc + conc_end = args.max_conc + else: + conc_end = min(conc_end, args.max_conc) + + conc = conc_start + while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EP.value: 1, # Default + Fields.DP_ATTN.value: False, # Default + Fields.SPEC_DECODING.value: spec_decoding, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, + } + + if ep is not None: + entry[Fields.EP.value] = ep + if dp_attn is not None: + entry[Fields.DP_ATTN.value] = dp_attn + + validate_matrix_entry(entry, is_multinode) + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + return matrix_values + + +def 
generate_runner_model_sweep_config(args, all_config_data, runner_data): + """Generate runner-model sweep configurations. + + Assumes all_config_data has been validated by validate_config_structure(). + Supports both single-node and multinode configurations. + """ + runner_nodes = runner_data.get(args.runner_type) + + if not runner_nodes: + raise ValueError( + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_data.keys())}'.") + + # Filter runner nodes if filter is specified + if args.runner_node_filter: + runner_nodes = [ + node for node in runner_nodes if args.runner_node_filter in node] + if not runner_nodes: + raise ValueError( + f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.") + + matrix_values = [] + for key, val in all_config_data.items(): + # Only consider configs with specified runner + if val[Fields.RUNNER.value] != args.runner_type: + continue + + is_multinode = val.get(Fields.MULTINODE.value, False) + + # Skip configs that don't match the requested node type + if args.single_node and is_multinode: + continue + if args.multi_node and not is_multinode: + continue + + # Get model code for exp_name + model_code = val[Fields.MODEL_PREFIX.value] + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) + + # Find 1k1k config + target_config = None + for config in val[Fields.SEQ_LEN_CONFIGS.value]: + if config[Fields.ISL.value] == 1024 and config[Fields.OSL.value] == 1024: + target_config = config + break + + if target_config is None: + continue + + if is_multinode: + # For multinode, find the search space entry with the lowest concurrency + def get_lowest_conc(search_space_entry): + conc_list = search_space_entry.get(Fields.CONC_LIST.value, []) + return min(conc_list) if conc_list else float('inf') + + lowest_conc_entry = min( + 
target_config[Fields.SEARCH_SPACE.value], key=get_lowest_conc) + + conc_list = lowest_conc_entry.get(Fields.CONC_LIST.value, []) + lowest_conc = min(conc_list) if conc_list else 1 + + spec_decoding = lowest_conc_entry.get( + Fields.SPEC_DECODING.value, "none") + prefill_config = lowest_conc_entry[Fields.PREFILL.value] + decode_config = lowest_conc_entry[Fields.DECODE.value] + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: { + Fields.NUM_WORKER.value: prefill_config[Fields.NUM_WORKER.value], + Fields.TP.value: prefill_config[Fields.TP.value], + Fields.EP.value: prefill_config[Fields.EP.value], + Fields.DP_ATTN.value: prefill_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: prefill_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.DECODE.value: { + Fields.NUM_WORKER.value: decode_config[Fields.NUM_WORKER.value], + Fields.TP.value: decode_config[Fields.TP.value], + Fields.EP.value: decode_config[Fields.EP.value], + Fields.DP_ATTN.value: decode_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: decode_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.CONC.value: [lowest_conc], + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + else: + # Single-node: pick highest TP config with lowest concurrency + highest_tp_bmk = max( + target_config[Fields.SEARCH_SPACE.value], key=lambda x: x[Fields.TP.value]) + highest_tp = highest_tp_bmk[Fields.TP.value] + lowest_conc = 
highest_tp_bmk[Fields.CONC_START.value] + + ep = highest_tp_bmk.get(Fields.EP.value) + dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) + spec_decoding = highest_tp_bmk.get(Fields.SPEC_DECODING.value, "none") + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.TP.value: highest_tp, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.CONC.value: lowest_conc, + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) + + return matrix_values + + +def load_config_files(config_files): + """Load and merge configuration files.""" + all_config_data = {} + for config_file in config_files: + try: + with open(config_file, 'r') as f: + config_data = yaml.safe_load(f) + assert isinstance( + config_data, dict), f"Config file '{config_file}' must contain a dictionary" + + # Check for duplicate keys, this is only in place to prevent against the very unlikely + # case where an entry in one config accidentally/purposefully tries to override an entry in another config + duplicate_keys = set(all_config_data.keys()) & set( + config_data.keys()) + if duplicate_keys: + raise ValueError( + f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" + ) + + all_config_data.update(config_data) + except FileNotFoundError: + raise ValueError(f"Input file '{config_file}' does not exist.") + + return all_config_data + + +def load_runner_file(runner_file): + """Load runner configuration file.""" 
+ try: + with open(runner_file, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError( + f"Runner config file '{runner_file}' does not exist.") + + return runner_config + + +def main(): + # Create parent parser with common arguments + parent_parser = argparse.ArgumentParser(add_help=False) + parent_parser.add_argument( + '--config-files', + nargs='+', + required=True, + help='One or more configuration files (YAML format)' + ) + parent_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information (YAML format)' + ) + + # Create main parser + parser = argparse.ArgumentParser( + description='Generate benchmark configurations from YAML config files' + ) + + # Create subparsers for subcommands + subparsers = parser.add_subparsers( + dest='command', + required=True, + help='Available commands' + ) + + # Subcommand: full-sweep + full_sweep_parser = subparsers.add_parser( + 'full-sweep', + parents=[parent_parser], + add_help=False, + help='Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths' + ) + full_sweep_parser.add_argument( + '--model-prefix', + nargs='+', + required=False, + help='Model prefix(es) to filter configurations (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--precision', + nargs='+', + required=False, + help='Precision(s) to filter by (e.g., fp4, fp8) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--framework', + nargs='+', + required=False, + help='Framework(s) to filter by (e.g., vllm, trt, sglang) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--runner-type', + nargs='+', + required=False, + help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' + ) + full_sweep_parser.add_argument( + '--seq-lens', + nargs='+', + choices=list(seq_len_stoi.keys()), + required=False, + 
help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." + ) + full_sweep_parser.add_argument( + '--step-size', + type=int, + default=2, + help='Step size for concurrency values (default: 2)' + ) + full_sweep_parser.add_argument( + '--max-conc', + type=int, + required=False, + help='Maximum concurrency value to include (filters out higher concurrency values)' + ) + full_sweep_parser.add_argument( + '--max-tp', + type=int, + required=False, + help='Maximum tensor parallelism value to include (single-node only)' + ) + full_sweep_parser.add_argument( + '--max-ep', + type=int, + required=False, + help='Maximum expert parallelism value to include (single-node only)' + ) + node_type_group = full_sweep_parser.add_mutually_exclusive_group(required=True) + node_type_group.add_argument( + '--single-node', + action='store_true', + help='Only generate single-node configurations' + ) + node_type_group.add_argument( + '--multi-node', + action='store_true', + help='Only generate multi-node configurations' + ) + full_sweep_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + # Subcommand: runner-model-sweep + test_config_parser = subparsers.add_parser( + 'runner-model-sweep', + parents=[parent_parser], + add_help=False, + help='Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner nodes.' 
+ ) + test_config_parser.add_argument( + '--runner-type', + required=True, + help='Runner type (e.g., b200-trt, h100)' + ) + test_config_parser.add_argument( + '--runner-node-filter', + required=False, + help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)' + ) + test_node_group = test_config_parser.add_mutually_exclusive_group( + required=True) + test_node_group.add_argument( + '--single-node', + action='store_true', + help='Generate single-node configurations only' + ) + test_node_group.add_argument( + '--multi-node', + action='store_true', + help='Generate multi-node configurations only' + ) + test_config_parser.add_argument( + '-h', '--help', + action='help', + help='Show this help message and exit' + ) + + args = parser.parse_args() + + # Load and validate configuration files + all_config_data = load_config_files(args.config_files) + runner_data = load_runner_file(args.runner_config) + validate_master_config(all_config_data) + validate_runner_config(runner_data) + + # Route to appropriate function based on subcommand + if args.command == 'full-sweep': + matrix_values = generate_full_sweep(args, all_config_data, runner_data) + elif args.command == 'runner-model-sweep': + matrix_values = generate_runner_model_sweep_config( + args, all_config_data, runner_data) + else: + parser.error(f"Unknown command: {args.command}") + + print(json.dumps(matrix_values)) + return matrix_values + + +if __name__ == "__main__": + main() diff --git a/utils/matrix-logic/pytest.ini b/utils/matrix_logic/pytest.ini similarity index 100% rename from utils/matrix-logic/pytest.ini rename to utils/matrix_logic/pytest.ini diff --git a/utils/matrix_logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py new file mode 100644 index 000000000..1381f394e --- /dev/null +++ b/utils/matrix_logic/test_generate_sweep_configs.py @@ -0,0 +1,1034 @@ +"""Comprehensive tests for generate_sweep_configs.py""" +import 
pytest +import argparse +from generate_sweep_configs import ( + seq_len_stoi, + seq_len_itos, + seq_len_to_str, + generate_full_sweep, + generate_runner_model_sweep_config, + load_config_files, + load_runner_file, +) + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture +def sample_single_node_config(): + """Single node config based on dsr1-fp8-mi300x-sglang.""" + return { + "dsr1-fp8-mi300x-sglang": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }, + { + "isl": 8192, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + } + + +@pytest.fixture +def sample_multinode_config(): + """Multinode config based on dsr1-fp4-gb200-dynamo-trt.""" + return { + "dsr1-fp4-gb200-dynamo-trt": { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [2150], + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + ], + }, + } + ] + } + ] + } + } + + +@pytest.fixture +def sample_runner_config(): + """Runner config based on 
.github/configs/runners.yaml.""" + return { + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], + } + + +@pytest.fixture +def full_sweep_args_single_node(): + """Args for full-sweep single-node command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = True + args.multi_node = False + return args + + +@pytest.fixture +def full_sweep_args_multi_node(): + """Args for full-sweep multi-node command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = False + args.multi_node = True + return args + + +# ============================================================================= +# Test seq_len mappings +# ============================================================================= + +class TestSeqLenMappings: + """Tests for sequence length string mappings.""" + + def test_seq_len_stoi_values(self): + """Verify seq_len_stoi has expected mappings.""" + assert seq_len_stoi["1k1k"] == (1024, 1024) + assert seq_len_stoi["1k8k"] == (1024, 8192) + assert seq_len_stoi["8k1k"] == (8192, 1024) + + def test_seq_len_itos_reverse_mapping(self): + """Verify seq_len_itos is reverse of stoi.""" + assert seq_len_itos[(1024, 1024)] == "1k1k" + assert seq_len_itos[(1024, 8192)] == "1k8k" + assert seq_len_itos[(8192, 1024)] == "8k1k" + + +class TestSeqLenToStr: + """Tests for seq_len_to_str function.""" + + def test_known_sequence_lengths(self): + 
"""Known sequence lengths should return short name.""" + assert seq_len_to_str(1024, 1024) == "1k1k" + assert seq_len_to_str(1024, 8192) == "1k8k" + assert seq_len_to_str(8192, 1024) == "8k1k" + + def test_unknown_sequence_lengths(self): + """Unknown sequence lengths should return isl_osl format.""" + assert seq_len_to_str(2048, 2048) == "2048_2048" + assert seq_len_to_str(4096, 1024) == "4096_1024" + + +# ============================================================================= +# Test generate_full_sweep for single-node +# ============================================================================= + +class TestGenerateFullSweepSingleNode: + """Tests for generate_full_sweep with single-node configs.""" + + def test_basic_sweep_generation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Basic single-node sweep should generate entries.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + # With step_size=2, conc goes 4, 8, 16, 32, 64 = 5 values per seq-len config + # 2 seq-len configs * 5 = 10 entries + assert len(result) == 10 + + def test_matrix_entry_structure(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Generated entries should have correct structure.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + entry = result[0] + assert entry["image"] == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915" + assert entry["model"] == "deepseek-ai/DeepSeek-R1-0528" + assert entry["precision"] == "fp8" + assert entry["framework"] == "sglang" + assert entry["runner"] == "mi300x" + assert entry["tp"] == 8 + assert "exp-name" in entry + assert "max-model-len" in entry + + def test_filter_by_model_prefix(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by 
model prefix should work.""" + full_sweep_args_single_node.model_prefix = ["dsr1"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + # Non-matching prefix should return empty + full_sweep_args_single_node.model_prefix = ["nonexistent"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_precision(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by precision should work.""" + full_sweep_args_single_node.precision = ["fp8"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.precision = ["fp4"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_framework(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by framework should work.""" + full_sweep_args_single_node.framework = ["sglang"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.framework = ["vllm"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_filter_by_runner_type(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by runner type should work.""" + full_sweep_args_single_node.runner_type = ["mi300x"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + + full_sweep_args_single_node.runner_type = ["h100"] 
+ result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_invalid_runner_type_raises_error(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Invalid runner type should raise ValueError.""" + full_sweep_args_single_node.runner_type = ["invalid_runner"] + with pytest.raises(ValueError) as exc_info: + generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert "Invalid runner type" in str(exc_info.value) + + def test_filter_by_seq_lens(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by sequence lengths should work.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # Only 1k1k entries, 5 concurrency values + assert len(result) == 5 + assert all(entry["isl"] == 1024 and entry["osl"] == 1024 for entry in result) + + def test_max_conc_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc filter should limit concurrency values.""" + full_sweep_args_single_node.max_conc = 16 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # conc values: 4, 8, 16 (32, 64 filtered out) + assert len(result) == 3 + assert all(entry["conc"] <= 16 for entry in result) + + def test_max_conc_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc below config's min should create config with max_conc value.""" + # Config has conc-start=4, so max_conc=1 should create entry with conc=1 + full_sweep_args_single_node.max_conc = 1 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = 
generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # Should create 1 entry with conc=1 + assert len(result) == 1 + assert result[0]["conc"] == 1 + + def test_max_conc_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_conc = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + + def test_max_tp_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp filter should use max_tp when config tp exceeds it.""" + full_sweep_args_single_node.max_tp = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # tp=8 in config, but max_tp=4, so should use tp=4 + assert len(result) > 0 + assert all(entry["tp"] == 4 for entry in result) + + def test_max_tp_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp below config's tp should create config with max_tp value.""" + # Config has tp=8, so max_tp=2 should create entries with tp=2 + full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + assert all(entry["tp"] == 2 for entry in result) + + def test_max_tp_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + 
full_sweep_args_single_node.max_tp = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_tp={invalid_value}" + + def test_step_size(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Different step sizes should affect concurrency progression.""" + full_sweep_args_single_node.step_size = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # conc: 4, 16, 64 = 3 values + assert len(result) == 3 + conc_values = [entry["conc"] for entry in result] + assert 4 in conc_values + assert 16 in conc_values + assert 64 in conc_values + + def test_exp_name_format(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """exp-name should have correct format.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert all(entry["exp-name"] == "dsr1_1k1k" for entry in result) + + def test_max_model_len_calculation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max-model-len should be isl + osl + 200.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + expected_max_model_len = entry["isl"] + entry["osl"] + 200 + assert entry["max-model-len"] == expected_max_model_len + + +# ============================================================================= +# Test generate_full_sweep for multi-node +# ============================================================================= + +class TestGenerateFullSweepMultiNode: + """Tests for generate_full_sweep with multi-node configs.""" + + def 
test_multinode_sweep_generation(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode sweep should generate entries with prefill/decode.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + assert len(result) == 1 # One entry with conc-list + + def test_multinode_entry_structure(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode entries should have prefill and decode configs.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + entry = result[0] + assert "prefill" in entry + assert "decode" in entry + assert entry["prefill"]["num-worker"] == 5 + assert entry["decode"]["num-worker"] == 1 + assert entry["disagg"] is True + + def test_multinode_conc_as_list(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode conc should be passed as list.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + entry = result[0] + assert isinstance(entry["conc"], list) + assert entry["conc"] == [2150] + + def test_single_node_flag_skips_multinode(self, sample_multinode_config, sample_runner_config, full_sweep_args_single_node): + """Single-node flag should skip multinode configs.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_multinode_config, + sample_runner_config + ) + assert len(result) == 0 + + +# ============================================================================= +# Test generate_runner_model_sweep_config +# ============================================================================= + +class TestGenerateRunnerModelSweepConfig: + """Tests for generate_runner_model_sweep_config function.""" + + @pytest.fixture + def runner_sweep_args(self): + """Args for runner-model-sweep command (single-node).""" + args = 
argparse.Namespace() + args.runner_type = "mi300x" + args.runner_config = "runners.yaml" + args.runner_node_filter = None + args.single_node = True + args.multi_node = False + return args + + def test_basic_runner_sweep(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Basic runner sweep should generate entries for each node.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # 3 mi300x nodes + assert len(result) == 3 + + def test_runner_sweep_entry_structure(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner sweep entries should use 1k1k config.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + assert entry["isl"] == 1024 + assert entry["osl"] == 1024 + assert entry["max-model-len"] == 2048 + assert "_test" in entry["exp-name"] + + def test_each_node_gets_entry(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Each runner node should get its own entry.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + runners = [entry["runner"] for entry in result] + assert "mi300x-amd_0" in runners + assert "mi300x-amd_1" in runners + assert "mi300x-cr_0" in runners + + def test_invalid_runner_type(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Invalid runner type should raise error.""" + runner_sweep_args.runner_type = "nonexistent" + with pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "does not exist" in str(exc_info.value) + + def test_runner_node_filter(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter should limit nodes.""" + 
runner_sweep_args.runner_node_filter = "amd" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Only mi300x-amd_0 and mi300x-amd_1 match + assert len(result) == 2 + assert all("amd" in entry["runner"] for entry in result) + + def test_runner_node_filter_no_match(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter with no matches should raise error.""" + runner_sweep_args.runner_node_filter = "nonexistent" + with pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "No runner nodes found" in str(exc_info.value) + + def test_uses_highest_tp(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use highest TP from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has tp=8 + assert all(entry["tp"] == 8 for entry in result) + + def test_uses_lowest_conc(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use lowest concurrency from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has conc-start=4 + assert all(entry["conc"] == 4 for entry in result) + + +# ============================================================================= +# Test load_config_files +# ============================================================================= + +class TestLoadConfigFiles: + """Tests for load_config_files function.""" + + def test_load_single_file(self, tmp_path): + """Should load a single config file.""" + config_file = tmp_path / "config.yaml" + config_file.write_text(""" +test-config: + image: test-image + model: test-model +""") + result = 
load_config_files([str(config_file)]) + assert "test-config" in result + assert result["test-config"]["image"] == "test-image" + + def test_load_multiple_files(self, tmp_path): + """Should merge multiple config files.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +config-one: + value: 1 +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +config-two: + value: 2 +""") + result = load_config_files([str(config1), str(config2)]) + assert "config-one" in result + assert "config-two" in result + + def test_duplicate_keys_raise_error(self, tmp_path): + """Duplicate keys across files should raise error.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +duplicate-key: + value: 1 +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +duplicate-key: + value: 2 +""") + with pytest.raises(ValueError) as exc_info: + load_config_files([str(config1), str(config2)]) + assert "Duplicate configuration keys" in str(exc_info.value) + + def test_nonexistent_file_raises_error(self): + """Nonexistent file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_config_files(["nonexistent.yaml"]) + assert "does not exist" in str(exc_info.value) + + +# ============================================================================= +# Test load_runner_file +# ============================================================================= + +class TestLoadRunnerFile: + """Tests for load_runner_file function.""" + + def test_load_runner_file(self, tmp_path): + """Should load runner config file.""" + runner_file = tmp_path / "runners.yaml" + runner_file.write_text(""" +h100: +- h100-node-0 +- h100-node-1 +""") + result = load_runner_file(str(runner_file)) + assert "h100" in result + assert len(result["h100"]) == 2 + + def test_nonexistent_runner_file(self): + """Nonexistent runner file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_runner_file("nonexistent.yaml") + assert "does not 
exist" in str(exc_info.value) + + +# ============================================================================= +# Test edge cases and special configurations +# ============================================================================= + +class TestEdgeCases: + """Tests for edge cases and special configurations.""" + + def test_config_with_ep_and_dp_attn(self, sample_runner_config, full_sweep_args_single_node): + """Config with ep and dp-attn should be handled correctly.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "ep": 4, "dp-attn": True, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 1 + assert result[0]["ep"] == 4 + assert result[0]["dp-attn"] is True + + def test_config_with_spec_decoding(self, sample_runner_config, full_sweep_args_single_node): + """Config with spec-decoding should be handled correctly.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "trt", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "spec-decoding": "mtp", "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 1 + assert result[0]["spec-decoding"] == "mtp" + + def test_conc_list_in_single_node(self, sample_runner_config, full_sweep_args_single_node): + """Single node config with conc-list should work.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + 
"precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 16} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + conc_values = [entry["conc"] for entry in result] + assert 4 in conc_values + assert 8 in conc_values + assert 16 in conc_values + + def test_disagg_defaults_to_false(self, sample_runner_config, full_sweep_args_single_node): + """disagg should default to False when not specified.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + # No disagg field + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert result[0]["disagg"] is False + + def test_multinode_conc_range_expansion(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode with conc range should expand to list.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-start": 1, + "conc-end": 8, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + assert len(result) == 1 + # step_size=2: 1, 2, 4, 8 + assert result[0]["conc"] == [1, 2, 4, 8] + + def 
test_max_ep_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_single_node): + """max_ep below config's ep should create config with max_ep value.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + full_sweep_args_single_node.max_ep = 2 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # ep=8 in config, but max_ep=2, so should use ep=2 + assert len(result) == 1 + assert result[0]["ep"] == 2 + + def test_max_ep_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_single_node): + """max_ep of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_ep = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_ep={invalid_value}" + + def test_multinode_max_conc_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + 
"conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_multi_node.max_conc = invalid_value + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + + def test_multinode_max_conc_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc below all values should create config with max_conc.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + full_sweep_args_multi_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + # All conc values (100, 200, 400) > max_conc (1), so should use [1] + assert len(result) == 1 + assert result[0]["conc"] == [1] + + def test_combined_max_filters(self, sample_runner_config, full_sweep_args_single_node): + """Multiple max filters should all apply and create configs with max values.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 100, "conc-end": 200} + ] + } + ] + } + } + 
full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.max_ep = 1 + full_sweep_args_single_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # All values exceed max, so should use max values + assert len(result) == 1 + assert result[0]["tp"] == 2 + assert result[0]["ep"] == 1 + assert result[0]["conc"] == 1 diff --git a/utils/matrix_logic/test_validation.py b/utils/matrix_logic/test_validation.py new file mode 100644 index 000000000..008ed2b42 --- /dev/null +++ b/utils/matrix_logic/test_validation.py @@ -0,0 +1,740 @@ +"""Comprehensive tests for validation.py""" +import pytest +from validation import ( + Fields, + SingleNodeMatrixEntry, + MultiNodeMatrixEntry, + WorkerConfig, + SingleNodeSearchSpaceEntry, + MultiNodeSearchSpaceEntry, + SingleNodeSeqLenConfig, + MultiNodeSeqLenConfig, + SingleNodeMasterConfigEntry, + MultiNodeMasterConfigEntry, + validate_matrix_entry, + validate_master_config, + validate_runner_config, +) + + +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture +def valid_single_node_matrix_entry(): + """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "sglang", + "spec-decoding": "none", + "runner": "mi355x", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": False, + } + + +@pytest.fixture +def valid_multinode_matrix_entry(): + """Valid multinode matrix entry based on dsr1-fp4-gb200-dynamo-trt config.""" + return { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": 
"deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "spec-decoding": "none", + "runner": "gb200", + "isl": 1024, + "osl": 1024, + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + "DECODE_MTP_SIZE=0", + ], + }, + "conc": [2150], + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": True, + } + + +@pytest.fixture +def valid_single_node_master_config(): + """Valid single node master config based on dsr1-fp8-mi300x-sglang.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + + +@pytest.fixture +def valid_multinode_master_config(): + """Valid multinode master config based on dsr1-fp4-gb200-dynamo-trt.""" + return { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", 
+ ], + }, + "conc-list": [2150], + } + ] + } + ] + } + + +@pytest.fixture +def valid_runner_config(): + """Valid runner config based on .github/configs/runners.yaml.""" + return { + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], + } + + +# ============================================================================= +# Test Fields Enum +# ============================================================================= + +class TestFieldsEnum: + """Tests for Fields enum.""" + + def test_field_values_are_strings(self): + """All field values should be strings.""" + for field in Fields: + assert isinstance(field.value, str) + + def test_key_fields_exist(self): + """Key fields should be defined.""" + assert Fields.IMAGE.value == "image" + assert Fields.MODEL.value == "model" + assert Fields.TP.value == "tp" + assert Fields.MULTINODE.value == "multinode" + assert Fields.CONC.value == "conc" + assert Fields.SPEC_DECODING.value == "spec-decoding" + assert Fields.PREFILL.value == "prefill" + assert Fields.DECODE.value == "decode" + + +# ============================================================================= +# Test WorkerConfig +# ============================================================================= + +class TestWorkerConfig: + """Tests for WorkerConfig model.""" + + def test_valid_worker_config(self): + """Valid worker config should pass.""" + config = WorkerConfig(**{ + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }) + assert config.num_worker == 5 + assert config.tp == 4 + assert config.ep == 4 + assert config.dp_attn is True + + def test_worker_config_with_additional_settings(self): + """Worker config with additional settings should pass.""" + config = WorkerConfig(**{ + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": 
True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + ], + }) + assert len(config.additional_settings) == 3 + assert "DECODE_MAX_NUM_TOKENS=256" in config.additional_settings + + def test_worker_config_missing_required_field(self): + """Missing required field should fail.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + # Missing ep and dp-attn + }) + + def test_worker_config_extra_field_forbidden(self): + """Extra fields should be forbidden.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + "ep": 1, + "dp-attn": False, + "unknown-field": "value", + }) + + +# ============================================================================= +# Test SingleNodeMatrixEntry +# ============================================================================= + +class TestSingleNodeMatrixEntry: + """Tests for SingleNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_single_node_matrix_entry): + """Valid entry should pass validation.""" + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.image == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915" + assert entry.tp == 8 + assert entry.conc == 4 + assert entry.framework == "sglang" + + def test_conc_as_list(self, valid_single_node_matrix_entry): + """Conc can be a list of integers.""" + valid_single_node_matrix_entry["conc"] = [4, 8, 16, 32, 64] + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.conc == [4, 8, 16, 32, 64] + + def test_spec_decoding_values(self, valid_single_node_matrix_entry): + """Spec decoding should accept valid literal values.""" + for value in ["mtp", "draft_model", "none"]: + valid_single_node_matrix_entry["spec-decoding"] = value + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.spec_decoding == value + + def test_invalid_spec_decoding(self, 
valid_single_node_matrix_entry): + """Invalid spec decoding value should fail.""" + valid_single_node_matrix_entry["spec-decoding"] = "invalid" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_missing_required_field(self, valid_single_node_matrix_entry): + """Missing required field should fail validation.""" + del valid_single_node_matrix_entry["model"] + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_extra_field_forbidden(self, valid_single_node_matrix_entry): + """Extra fields should be forbidden.""" + valid_single_node_matrix_entry["extra-field"] = "value" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + +# ============================================================================= +# Test MultiNodeMatrixEntry +# ============================================================================= + +class TestMultiNodeMatrixEntry: + """Tests for MultiNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_multinode_matrix_entry): + """Valid entry should pass validation.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.model == "deepseek-r1-fp4" + assert entry.conc == [2150] + assert entry.disagg is True + + def test_prefill_decode_worker_configs(self, valid_multinode_matrix_entry): + """Prefill and decode should be WorkerConfig objects.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.prefill.num_worker == 5 + assert entry.prefill.tp == 4 + assert entry.decode.tp == 8 + assert entry.decode.dp_attn is True + + def test_conc_must_be_list(self, valid_multinode_matrix_entry): + """Conc must be a list for multinode.""" + valid_multinode_matrix_entry["conc"] = 2150 # Single int, not list + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_prefill(self, valid_multinode_matrix_entry): + 
"""Missing prefill should fail.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_decode(self, valid_multinode_matrix_entry): + """Missing decode should fail.""" + del valid_multinode_matrix_entry["decode"] + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + +# ============================================================================= +# Test validate_matrix_entry function +# ============================================================================= + +class TestValidateMatrixEntry: + """Tests for validate_matrix_entry function.""" + + def test_valid_single_node(self, valid_single_node_matrix_entry): + """Valid single node entry should return the entry.""" + result = validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert result == valid_single_node_matrix_entry + + def test_valid_multinode(self, valid_multinode_matrix_entry): + """Valid multinode entry should return the entry.""" + result = validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert result == valid_multinode_matrix_entry + + def test_invalid_single_node_raises_valueerror(self, valid_single_node_matrix_entry): + """Invalid single node entry should raise ValueError.""" + del valid_single_node_matrix_entry["tp"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert "failed validation" in str(exc_info.value) + + def test_invalid_multinode_raises_valueerror(self, valid_multinode_matrix_entry): + """Invalid multinode entry should raise ValueError.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert "failed validation" in str(exc_info.value) + + +# 
============================================================================= +# Test SingleNodeSearchSpaceEntry +# ============================================================================= + +class TestSingleNodeSearchSpaceEntry: + """Tests for SingleNodeSearchSpaceEntry model.""" + + def test_valid_with_conc_range(self): + """Valid entry with conc range should pass (like mi300x config).""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 4, + "conc-end": 64, + }) + assert entry.tp == 8 + assert entry.conc_start == 4 + assert entry.conc_end == 64 + + def test_valid_with_conc_list(self): + """Valid entry with conc list should pass.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 8, 16, 32, 64, 128], + }) + assert entry.conc_list == [4, 8, 16, 32, 64, 128] + + def test_cannot_have_both_range_and_list(self): + """Cannot specify both conc range and list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-start": 4, + "conc-end": 64, + "conc-list": [4, 8, 16], + }) + assert "Cannot specify both" in str(exc_info.value) + + def test_must_have_range_or_list(self): + """Must specify either conc range or list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + }) + assert "Must specify either" in str(exc_info.value) + + def test_conc_start_must_be_lte_conc_end(self): + """conc-start must be <= conc-end.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 64, + "conc-end": 4, + }) + assert "must be <=" in str(exc_info.value) + + def test_conc_list_values_must_be_positive(self): + """conc-list values must be > 0.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 0, 16], + }) + assert "must be greater than 0" in str(exc_info.value) + + def test_optional_fields_defaults(self): + """Optional fields should have correct 
defaults.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-list": [4, 8], + }) + assert entry.ep is None + assert entry.dp_attn is None + assert entry.spec_decoding == "none" + + def test_with_ep_and_dp_attn(self): + """Entry with ep and dp-attn like b200-sglang config.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "ep": 4, + "dp-attn": True, + "conc-start": 4, + "conc-end": 128, + }) + assert entry.ep == 4 + assert entry.dp_attn is True + + def test_with_spec_decoding_mtp(self): + """Entry with mtp spec decoding.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "spec-decoding": "mtp", + "conc-list": [1, 2, 4], + }) + assert entry.spec_decoding == "mtp" + + +# ============================================================================= +# Test MultiNodeSearchSpaceEntry +# ============================================================================= + +class TestMultiNodeSearchSpaceEntry: + """Tests for MultiNodeSearchSpaceEntry model.""" + + def test_valid_with_conc_list(self): + """Valid multinode search space with list (like gb200 config).""" + entry = MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": ["PREFILL_MAX_NUM_TOKENS=8448"], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": ["DECODE_MAX_NUM_TOKENS=256"], + }, + "conc-list": [2150], + }) + assert entry.prefill.num_worker == 5 + assert entry.decode.tp == 8 + + def test_valid_with_conc_range(self): + """Valid multinode search space with range.""" + entry = MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + "conc-start": 1, + "conc-end": 64, + }) + assert entry.conc_start == 1 + assert entry.conc_end == 64 + + def test_with_spec_decoding_mtp(self): + """Multinode entry with mtp spec decoding.""" + entry 
= MultiNodeSearchSpaceEntry(**{ + "spec-decoding": "mtp", + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + "conc-list": [1, 2, 4, 8, 16, 36], + }) + assert entry.spec_decoding == "mtp" + + def test_missing_conc_specification(self): + """Missing conc specification should fail.""" + with pytest.raises(Exception): + MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + # Missing conc specification + }) + + +# ============================================================================= +# Test SeqLenConfig models +# ============================================================================= + +class TestSeqLenConfigs: + """Tests for sequence length config models.""" + + def test_single_node_seq_len_config_1k1k(self): + """Valid single node seq len config for 1k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 1024 + assert config.osl == 1024 + assert len(config.search_space) == 1 + + def test_single_node_seq_len_config_8k1k(self): + """Valid single node seq len config for 8k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 8192, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 8192 + assert config.osl == 1024 + + def test_multinode_seq_len_config(self): + """Valid multinode seq len config.""" + config = MultiNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + }, + "conc-list": [2150], + } + ] + }) + assert config.isl == 1024 + assert config.osl == 1024 + 
+ +# ============================================================================= +# Test MasterConfigEntry models +# ============================================================================= + +class TestMasterConfigEntries: + """Tests for master config entry models.""" + + def test_single_node_master_config(self, valid_single_node_master_config): + """Valid single node master config.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.multinode is False + assert config.model_prefix == "dsr1" + assert config.runner == "mi300x" + assert config.framework == "sglang" + + def test_multinode_master_config(self, valid_multinode_master_config): + """Valid multinode master config.""" + config = MultiNodeMasterConfigEntry(**valid_multinode_master_config) + assert config.multinode is True + assert config.model_prefix == "dsr1" + assert config.runner == "gb200" + assert config.disagg is True + + def test_single_node_cannot_have_multinode_true(self, valid_single_node_master_config): + """Single node config must have multinode=False.""" + valid_single_node_master_config["multinode"] = True + with pytest.raises(Exception): + SingleNodeMasterConfigEntry(**valid_single_node_master_config) + + def test_multinode_cannot_have_multinode_false(self, valid_multinode_master_config): + """Multinode config must have multinode=True.""" + valid_multinode_master_config["multinode"] = False + with pytest.raises(Exception): + MultiNodeMasterConfigEntry(**valid_multinode_master_config) + + def test_disagg_default_false(self, valid_single_node_master_config): + """Disagg should default to False.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.disagg is False + + +# ============================================================================= +# Test validate_master_config function +# ============================================================================= + +class TestValidateMasterConfig: + """Tests 
for validate_master_config function.""" + + def test_valid_single_node_config(self, valid_single_node_master_config): + """Valid single node config should pass.""" + configs = {"dsr1-fp8-mi300x-sglang": valid_single_node_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_valid_multinode_config(self, valid_multinode_master_config): + """Valid multinode config should pass.""" + configs = {"dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_mixed_configs(self, valid_single_node_master_config, valid_multinode_master_config): + """Mixed single and multinode configs should pass.""" + configs = { + "dsr1-fp8-mi300x-sglang": valid_single_node_master_config, + "dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config, + } + result = validate_master_config(configs) + assert len(result) == 2 + + def test_invalid_config_raises_valueerror(self, valid_single_node_master_config): + """Invalid config should raise ValueError with key name.""" + del valid_single_node_master_config["model"] + configs = {"broken-config": valid_single_node_master_config} + with pytest.raises(ValueError) as exc_info: + validate_master_config(configs) + assert "broken-config" in str(exc_info.value) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test validate_runner_config function +# ============================================================================= + +class TestValidateRunnerConfig: + """Tests for validate_runner_config function.""" + + def test_valid_runner_config(self, valid_runner_config): + """Valid runner config should pass.""" + result = validate_runner_config(valid_runner_config) + assert result == valid_runner_config + + def test_value_must_be_list(self): + """Runner config values must be lists.""" + config = { + "h100": "h100-cr_0", # Not a list + } + with 
pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "must be a list" in str(exc_info.value) + + def test_list_must_contain_strings(self): + """Runner config lists must contain only strings.""" + config = { + "h100": ["h100-cr_0", 123], # Contains non-string + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "must contain only strings" in str(exc_info.value) + + def test_list_cannot_be_empty(self): + """Runner config lists cannot be empty.""" + config = { + "mi355x": [], + } + with pytest.raises(ValueError) as exc_info: + validate_runner_config(config) + assert "cannot be an empty list" in str(exc_info.value) + + def test_multiple_runner_types(self, valid_runner_config): + """Multiple runner types should work.""" + result = validate_runner_config(valid_runner_config) + assert "h100" in result + assert "h200" in result + assert "mi300x" in result + assert "gb200" in result diff --git a/utils/matrix_logic/validation.py b/utils/matrix_logic/validation.py new file mode 100644 index 000000000..30012423a --- /dev/null +++ b/utils/matrix_logic/validation.py @@ -0,0 +1,317 @@ +from pydantic import BaseModel, Field, ValidationError, ConfigDict, model_validator +from typing import List, Optional, Union, Literal +from enum import Enum + +import pprint + +""" + The below class defines the field names expected to be present in the JSON entries + for both single-node and multi-node configurations. 
+""" + + +class Fields(Enum): + # Field name constants + # Top-level config fields + IMAGE = 'image' + MODEL = 'model' + MODEL_PREFIX = 'model-prefix' + PRECISION = 'precision' + FRAMEWORK = 'framework' + RUNNER = 'runner' + SEQ_LEN_CONFIGS = 'seq-len-configs' + MULTINODE = 'multinode' + + # Seq-len-config fields + ISL = 'isl' + OSL = 'osl' + SEARCH_SPACE = 'search-space' + + # Search-space/benchmark fields + TP = 'tp' + CONC_START = 'conc-start' + CONC_END = 'conc-end' + CONC_LIST = 'conc-list' + EP = 'ep' + DP_ATTN = 'dp-attn' + + # Multinode-specific fields (when MULTINODE = true) + SPEC_DECODING = 'spec-decoding' + PREFILL = 'prefill' + DECODE = 'decode' + NUM_WORKER = 'num-worker' + BATCH_SIZE = 'batch-size' + MAX_NUM_TOKENS = 'max-num-tokens' + ADDITIONAL_SETTINGS = 'additional-settings' + + # Matrix entry fields + CONC = 'conc' + MAX_MODEL_LEN = 'max-model-len' + EXP_NAME = 'exp-name' + DISAGG = 'disagg' + + +""" + Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., + the input to the actual workflow files. The validation enforces a strict set of rules on the structure + of the generated matrix entries to ensure correctness before proceeding with benchmarking. This ensures + that no validation has to happen in the workflow itself, i.e., at runtime, it is assumed that all inputs + are valid. Therefore, there should not be any default values set in these Pydantic models. Any missing value + should raise a validation error. +""" + + +class SingleNodeMatrixEntry(BaseModel): + """Pydantic model for validating single node matrix entry structure.
+ This validates the input that should be expected to .github/workflows/benchmark-tmpl.yml""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + alias=Fields.SPEC_DECODING.value + ) + runner: str + isl: int + osl: int + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + conc: Union[int, List[int]] + max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + disagg: bool + + +class WorkerConfig(BaseModel): + """Pydantic model for validating worker configuration in multinode entries.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + num_worker: int = Field(alias=Fields.NUM_WORKER.value) + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + additional_settings: Optional[List[str]] = Field( + default=[], alias=Fields.ADDITIONAL_SETTINGS.value) + + +class MultiNodeMatrixEntry(BaseModel): + """Pydantic model for validating multinode matrix entry structure. + This validates the input that should be expected to .github/workflows/benchmark-multinode-tmpl.yml""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + alias=Fields.SPEC_DECODING.value + ) + runner: str + isl: int + osl: int + prefill: WorkerConfig + decode: WorkerConfig + conc: List[int] + max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + disagg: bool + + +def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: + """Validate that matrix_values entries match the expected structure. 
+ + Raises ValueError if any entry fails validation. + Returns the original entry if it is valid. + """ + try: + if is_multinode: + MultiNodeMatrixEntry(**entry) + else: + SingleNodeMatrixEntry(**entry) + except ValidationError as e: + raise ValueError( + f"The following parsed matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}") + return entry + + +""" + Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e., + the master configuration files found in .github/configs. The validation enforces a strict set of + rules on the structure of the master configuration files to ensure correctness before proceeding + with matrix generation. +""" + + +def _validate_conc_fields(self): + """Ensure either (conc_start AND conc_end) OR conc_list is provided, but not both.""" + has_range = self.conc_start is not None and self.conc_end is not None + has_list = self.conc_list is not None and len(self.conc_list) > 0 + + if has_range and has_list: + raise ValueError( + f"Cannot specify both '{Fields.CONC_LIST.value}' list and " + f"'{Fields.CONC_START.value}'/'{Fields.CONC_END.value}'. " + "Use either a list or a range, not both." + ) + + if not has_range and not has_list: + raise ValueError( + f"Must specify either '{Fields.CONC_LIST.value}' list or both " + f"'{Fields.CONC_START.value}' and '{Fields.CONC_END.value}'." + ) + + if has_range: + if self.conc_start is None or self.conc_end is None: + raise ValueError( + f"Both '{Fields.CONC_START.value}' and '{Fields.CONC_END.value}' " + "must be provided together." + ) + + if self.conc_start > self.conc_end: + raise ValueError( + f"'{Fields.CONC_START.value}' ({self.conc_start}) must be <= " + f"'{Fields.CONC_END.value}' ({self.conc_end})." + ) + + if has_list: + if not all(x > 0 for x in self.conc_list): + raise ValueError( + f"Input '{Fields.CONC_LIST.value}' entries must be greater than 0."
+ ) + + return self + + +class SingleNodeSearchSpaceEntry(BaseModel): + """Single node search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + tp: int + ep: Optional[int] = None + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + default="none", alias=Fields.SPEC_DECODING.value) + dp_attn: Optional[bool] = Field( + default=None, alias=Fields.DP_ATTN.value) + conc_start: Optional[int] = Field( + default=None, alias=Fields.CONC_START.value) + conc_end: Optional[int] = Field( + default=None, alias=Fields.CONC_END.value) + conc_list: Optional[List[int]] = Field( + default=None, alias=Fields.CONC_LIST.value) + + @model_validator(mode='after') + def validate_conc_fields(self): + return _validate_conc_fields(self) + + +class MultiNodeSearchSpaceEntry(BaseModel): + """Multinode search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + default="none", alias=Fields.SPEC_DECODING.value) + prefill: WorkerConfig + decode: WorkerConfig + conc_start: Optional[int] = Field( + default=None, alias=Fields.CONC_START.value) + conc_end: Optional[int] = Field( + default=None, alias=Fields.CONC_END.value) + conc_list: Optional[List[int]] = Field( + default=None, alias=Fields.CONC_LIST.value) + + @model_validator(mode='after') + def validate_conc_fields(self): + return _validate_conc_fields(self) + + +class SingleNodeSeqLenConfig(BaseModel): + """Single node sequence length configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + isl: int + osl: int + search_space: List[SingleNodeSearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value) + + +class MultiNodeSeqLenConfig(BaseModel): + """Multinode sequence length configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + isl: int + osl: int + search_space: List[MultiNodeSearchSpaceEntry] = Field( + 
alias=Fields.SEARCH_SPACE.value) + + +class SingleNodeMasterConfigEntry(BaseModel): + """Top-level single node master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + multinode: Literal[False] + disagg: bool = Field(default=False) + seq_len_configs: List[SingleNodeSeqLenConfig] = Field( + alias=Fields.SEQ_LEN_CONFIGS.value) + + +class MultiNodeMasterConfigEntry(BaseModel): + """Top-level multinode master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + multinode: Literal[True] + disagg: bool = Field(default=False) + seq_len_configs: List[MultiNodeSeqLenConfig] = Field( + alias=Fields.SEQ_LEN_CONFIGS.value) + + +def validate_master_config(master_configs: dict) -> List[dict]: + """Validate input master configuration structure.""" + for key, entry in master_configs.items(): + is_multinode = entry.get('multinode', False) + + try: + if is_multinode: + MultiNodeMasterConfigEntry(**entry) + else: + SingleNodeMasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"Master config entry '{key}' failed validation:\n{e}") + return master_configs + +# Runner Config Validation + + +def validate_runner_config(runner_configs: dict) -> List[dict]: + """Validate input master configuration structure.""" + for key, value in runner_configs.items(): + if not isinstance(value, list): + raise ValueError( + f"Runner config entry '{key}' must be a list, got {type(value).__name__}") + + if not all(isinstance(item, str) for item in value): + raise ValueError( + f"Runner config entry '{key}' must contain only strings") + + if not value: + raise ValueError( + f"Runner config entry '{key}' cannot be an empty list") 
+ + return runner_configs diff --git a/utils/process_result.py b/utils/process_result.py index 1a59ce301..0a84a1f18 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,58 +4,122 @@ from pathlib import Path -hw = os.environ.get('RUNNER_TYPE') -tp_size = int(os.environ.get('TP')) -ep_size = int(os.environ.get('EP_SIZE')) -prefill_gpus_str = os.environ.get('PREFILL_GPUS', '') -decode_gpus_str = os.environ.get('DECODE_GPUS', '') - -# If empty string (aggregated runs), assign to tp_size (total gpus), otherwise convert to int -prefill_gpus = tp_size if not prefill_gpus_str else int(prefill_gpus_str) -decode_gpus = tp_size if not decode_gpus_str else int(decode_gpus_str) -dp_attention = os.environ.get('DP_ATTENTION') -result_filename = os.environ.get('RESULT_FILENAME') -framework = os.environ.get('FRAMEWORK') -precision = os.environ.get('PRECISION') -mtp_mode = os.environ.get('MTP_MODE') -isl = os.environ.get('ISL') -osl = os.environ.get('OSL') +def get_required_env_vars(required_vars): + """Load and validate required environment variables.""" + env_values = {} + missing_env_vars = [] + + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}") + + return env_values + + +# Base required env vars +base_env = get_required_env_vars([ + 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', + 'RESULT_FILENAME', 'ISL', 'OSL', 'DISAGG', 'MODEL_PREFIX', 'IMAGE' +]) + +hw = base_env['RUNNER_TYPE'] +model_prefix = base_env['MODEL_PREFIX'] +framework = base_env['FRAMEWORK'] +precision = base_env['PRECISION'] +spec_decoding = base_env['SPEC_DECODING'] +disagg = base_env['DISAGG'].lower() == 'true' +result_filename = base_env['RESULT_FILENAME'] +isl = base_env['ISL'] +osl = base_env['OSL'] +image = base_env['IMAGE'] with 
open(f'{result_filename}.json') as f: bmk_result = json.load(f) data = { 'hw': hw, - 'tp': tp_size, - 'ep': ep_size, - 'dp_attention': dp_attention, # true or false 'conc': int(bmk_result['max_concurrency']), + 'image': image, 'model': bmk_result['model_id'], + 'infmax_model_prefix': model_prefix, 'framework': framework, 'precision': precision, - 'isl': int(isl) if isl else None, - 'osl': int(osl) if osl else None, - 'tput_per_gpu': float(bmk_result['total_token_throughput']) / tp_size, - 'output_tput_per_gpu': float(bmk_result['output_throughput']) / decode_gpus, - 'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput']) )/ prefill_gpus + 'spec_decoding': spec_decoding, + 'disagg': disagg, + 'isl': int(isl), + 'osl': int(osl), } -# Check if both PREFILL_GPUS and DECODE_GPUS env vars exist and are not empty -if prefill_gpus_str and decode_gpus_str: - data['disagg'] = True - data['num_prefill_gpu'] = prefill_gpus - data['num_decode_gpu'] = decode_gpus +is_multinode = os.environ.get('IS_MULTINODE', 'false').lower() == 'true' + +if is_multinode: + # TODO: Eventually will have to have a separate condition in here for multinode disagg and + # multinode agg. For now, just assume that multinode implies disagg. 
+ + multinode_env = get_required_env_vars(['PREFILL_GPUS', 'DECODE_GPUS', 'PREFILL_NUM_WORKERS', 'PREFILL_TP', + 'PREFILL_EP', 'PREFILL_DP_ATTN', 'DECODE_NUM_WORKERS', 'DECODE_TP', 'DECODE_EP', 'DECODE_DP_ATTN']) + prefill_gpus = int(multinode_env['PREFILL_GPUS']) + decode_gpus = int(multinode_env['DECODE_GPUS']) + prefill_num_workers = int(multinode_env['PREFILL_NUM_WORKERS']) + prefill_tp = int(multinode_env['PREFILL_TP']) + prefill_ep = int(multinode_env['PREFILL_EP']) + prefill_dp_attn = multinode_env['PREFILL_DP_ATTN'] + decode_num_workers = int(multinode_env['DECODE_NUM_WORKERS']) + decode_tp = int(multinode_env['DECODE_TP']) + decode_ep = int(multinode_env['DECODE_EP']) + decode_dp_attn = multinode_env['DECODE_DP_ATTN'] + + multi_node_data = { + 'is_multinode': True, + 'prefill_tp': prefill_tp, + 'prefill_ep': prefill_ep, + 'prefill_dp_attention': prefill_dp_attn, + 'prefill_num_workers': prefill_num_workers, + 'decode_tp': decode_tp, + 'decode_ep': decode_ep, + 'decode_dp_attention': decode_dp_attn, + 'decode_num_workers': decode_num_workers, + 'num_prefill_gpu': prefill_gpus, + 'num_decode_gpu': decode_gpus, + 'tput_per_gpu': float(bmk_result['total_token_throughput']) / (prefill_gpus + decode_gpus), + 'output_tput_per_gpu': float(bmk_result['output_throughput']) / decode_gpus, + 'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput'])) / prefill_gpus, + } + + data = data | multi_node_data else: - data['disagg'] = False + if disagg: + raise ValueError("Disaggregated mode requires multinode setup.") + + single_node_env = get_required_env_vars(['TP', 'EP_SIZE', 'DP_ATTENTION']) + tp_size = int(single_node_env['TP']) + ep_size = int(single_node_env['EP_SIZE']) + dp_attention = single_node_env['DP_ATTENTION'] + + single_node_data = { + 'is_multinode': False, + 'tp': tp_size, + 'ep': ep_size, + 'dp_attention': dp_attention, + 'tput_per_gpu': float(bmk_result['total_token_throughput']) / tp_size, + 
'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size, + 'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput'])) / tp_size, + } -if mtp_mode: # MTP - data['mtp'] = mtp_mode + data = data | single_node_data for key, value in bmk_result.items(): if key.endswith('ms'): data[key.replace('_ms', '')] = float(value) / 1000.0 if 'tpot' in key: - data[key.replace('_ms', '').replace('tpot', 'intvty')] = 1000.0 / float(value) + data[key.replace('_ms', '').replace( + 'tpot', 'intvty')] = 1000.0 / float(value) print(json.dumps(data, indent=2)) diff --git a/utils/scrape_image_tag.py b/utils/scrape_image_tag.py deleted file mode 100644 index f45b5f9fd..000000000 --- a/utils/scrape_image_tag.py +++ /dev/null @@ -1,43 +0,0 @@ -import sys -import requests - -repository = sys.argv[1] -auth_url = f'https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repository}:pull' -token = requests.get(auth_url).json()['token'] - -tags_url = f'https://registry-1.docker.io/v2/{repository}/tags/list' -resp = requests.get(tags_url, headers={'Authorization': f'Bearer {token}'}) -resp.raise_for_status() - -vllm_tags = resp.json()['tags'] -valid_tags = [tag for tag in vllm_tags if tag.startswith(sys.argv[2]) and 'rc' not in tag] - - -def make_key_cuda(tag): - ''' - Tag format: vX.Y.Z(.W) - X, Y, Z are numbers - W can be a number or string postN (N is a number) - ''' - vals = tag[1:].split('.') - post = vals[3] if len(vals) == 4 else '0' - key = (int(vals[0]), int(vals[1]), int(vals[2]), post) - return key - -def make_key_rocm(tag): - *_, date = tag.split('_') - try: - key = int(date) - except: - key = -1 - return key - -if repository == 'vllm/vllm-openai': - make_key_fn = make_key_cuda -elif repository == 'rocm/vllm': - make_key_fn = make_key_rocm -else: - raise ValueError(f'Invalid repo {repository}') - -tag = max(valid_tags, key=make_key_fn) -print(f'{repository}:{tag}') diff --git a/utils/summarize.py 
b/utils/summarize.py index 503da2690..a46c2e02a 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -1,44 +1,127 @@ import sys import json from pathlib import Path +from tabulate import tabulate +# Header constants +MODEL = "Model" +SERVED_MODEL = "Served Model" +HARDWARE = "Hardware" +FRAMEWORK = "Framework" +PRECISION = "Precision" +ISL = "ISL" +OSL = "OSL" +TP = "TP" +EP = "EP" +DP_ATTENTION = "DP Attention" +CONC = "Conc" +TTFT = "TTFT (ms)" +TPOT = "TPOT (ms)" +INTERACTIVITY = "Interactivity (tok/s/user)" +E2EL = "E2EL (s)" +TPUT_PER_GPU = "TPUT per GPU" +OUTPUT_TPUT_PER_GPU = "Output TPUT per GPU" +INPUT_TPUT_PER_GPU = "Input TPUT per GPU" +PREFILL_TP = "Prefill TP" +PREFILL_EP = "Prefill EP" +PREFILL_DP_ATTN = "Prefill DP Attn" +PREFILL_WORKERS = "Prefill Workers" +PREFILL_GPUS = "Prefill GPUs" +DECODE_TP = "Decode TP" +DECODE_EP = "Decode EP" +DECODE_DP_ATTN = "Decode DP Attn" +DECODE_WORKERS = "Decode Workers" +DECODE_GPUS = "Decode GPUs" results = [] results_dir = Path(sys.argv[1]) -for result_path in results_dir.rglob(f'*.json'): +for result_path in results_dir.rglob('*.json'): with open(result_path) as f: result = json.load(f) results.append(result) -results.sort(key=lambda r: (r.get('model', 'unknown'), r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r.get('isl', 0), r.get('osl', 0), r['tp'], r['ep'], r['conc'])) - -summary_header = f'''\ -| Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ -''' -print(summary_header) - -for result in results: - framework = result.get('framework', 'vllm') - precision = result.get('precision', 'fp8') - model = result.get('model', 'unknown') - isl = result.get('isl', 'N/A') - osl = result.get('osl', 'N/A') - print( - f"| {model} " - f"| 
{result['hw'].upper()} " - f"| {framework.upper()} " - f"| {precision.upper()} " - f"| {isl} " - f"| {osl} " - f"| {result['tp']} " - f"| {result['ep']} " - f"| {result['dp_attention']} " - f"| {result['conc']} " - f"| {(result['median_ttft'] * 1000):.4f} " - f"| {(result['median_tpot'] * 1000):.4f} " - f"| {result['median_intvty']:.4f} " - f"| {result['median_e2el']:.4f} " - f"| {result['tput_per_gpu']:.4f} " - f"| {result['output_tput_per_gpu']:.4f} " - f"| {result['input_tput_per_gpu']:.4f} |" - ) + +single_node_results = [r for r in results if not r['is_multinode']] +multinode_results = [r for r in results if r['is_multinode']] + +# Single-node and multi-node results have different fields and therefore need to be printed separately +if single_node_results: + single_node_results.sort(key=lambda r: ( + r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) + + single_node_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] + + single_node_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['tp'], + r['ep'], + r['dp_attention'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in single_node_results + ] + + print("## Single-Node Results\n") + print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) + print("\n") + +if multinode_results: + multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], + r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], 
r['conc'])) + + multinode_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, PREFILL_GPUS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] + + multinode_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['num_prefill_gpu'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['num_decode_gpu'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in multinode_results + ] + + print("## Multi-Node Results\n") + print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index bb1765acf..2a6389a78 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -1,150 +1,489 @@ +"""Comprehensive tests for process_result.py + +Since process_result.py executes code at module import time, we test it by: +1. Testing the get_required_env_vars function directly +2. 
Running the script as a subprocess with mocked environment and files +""" import pytest import json +import subprocess import sys -import importlib.util from pathlib import Path -from io import StringIO - - -def create_mock_result_file(tmp_path): - """Create a mock result JSON file.""" - result_data = { - "max_concurrency": 10, - "model_id": "test-model", - "total_token_throughput": 1000.0, - "output_throughput": 400.0, - "ttft_ms": 50.0, - "tpot_ms": 20.0 + +SCRIPT_PATH = Path(__file__).parent / "process_result.py" + + +# ============================================================================= +# Test Fixtures - Based on real benchmark output structure +# ============================================================================= + +@pytest.fixture +def sample_benchmark_result(): + """Sample benchmark result JSON based on real output structure.""" + return { + "model_id": "deepseek-ai/DeepSeek-R1-0528", + "max_concurrency": 64, + "total_token_throughput": 15000.5, + "output_throughput": 12000.0, + "ttft_p50_ms": 150.5, + "ttft_p99_ms": 250.3, + "tpot_p50_ms": 25.0, + "tpot_p99_ms": 45.0, + "e2e_latency_p50_ms": 1500.0, + "e2e_latency_p99_ms": 2500.0, + } + + +@pytest.fixture +def base_env_vars(): + """Base environment variables for single-node setup.""" + return { + "RUNNER_TYPE": "mi300x", + "FRAMEWORK": "sglang", + "PRECISION": "fp8", + "SPEC_DECODING": "none", + "RESULT_FILENAME": "benchmark_result", + "ISL": "1024", + "OSL": "1024", + "DISAGG": "false", + "MODEL_PREFIX": "dsr1", + } + + +@pytest.fixture +def single_node_env_vars(base_env_vars): + """Environment variables for single-node setup.""" + return { + **base_env_vars, + "TP": "8", + "EP_SIZE": "1", + "DP_ATTENTION": "false", + } + + +@pytest.fixture +def multinode_env_vars(base_env_vars): + """Environment variables for multinode setup based on gb200 config.""" + return { + **base_env_vars, + "RUNNER_TYPE": "gb200", + "FRAMEWORK": "dynamo-trt", + "PRECISION": "fp4", + "DISAGG": "true", + 
"IS_MULTINODE": "true", + "PREFILL_GPUS": "20", + "DECODE_GPUS": "8", + "PREFILL_NUM_WORKERS": "5", + "PREFILL_TP": "4", + "PREFILL_EP": "4", + "PREFILL_DP_ATTN": "true", + "DECODE_NUM_WORKERS": "1", + "DECODE_TP": "8", + "DECODE_EP": "8", + "DECODE_DP_ATTN": "true", } - result_file = tmp_path / "test_result.json" - with open(result_file, 'w') as f: - json.dump(result_data, f) - return result_file - - -def run_process_result_script(tmp_path): - """Helper to run process_result.py and return the output data.""" - # Create mock result file - create_mock_result_file(tmp_path) - - # Get script path relative to this test file - script_path = Path(__file__).parent / "process_result.py" - spec = importlib.util.spec_from_file_location("process_result", script_path) - module = importlib.util.module_from_spec(spec) - - # Capture stdout - old_stdout = sys.stdout - sys.stdout = StringIO() - - try: - spec.loader.exec_module(module) - output = sys.stdout.getvalue() - return json.loads(output) - finally: - sys.stdout = old_stdout - - -def test_disagg_true_when_both_env_vars_set(tmp_path, monkeypatch): - """Test that disagg=true when both PREFILL_GPUS and DECODE_GPUS are set.""" - # Set environment variables - monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('PREFILL_GPUS', '4') - monkeypatch.setenv('DECODE_GPUS', '4') - monkeypatch.setenv('DP_ATTENTION', 'false') - monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is true - assert data['disagg'] is True - # Check that num_prefill_gpu and num_decode_gpu are present - assert data['num_prefill_gpu'] == 4 - assert data['num_decode_gpu'] == 4 - - -def 
test_disagg_false_when_prefill_gpus_not_set(tmp_path, monkeypatch): - """Test that disagg=false when PREFILL_GPUS is not set.""" - # Set environment variables (without PREFILL_GPUS) - monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('DECODE_GPUS', '4') - monkeypatch.setenv('DP_ATTENTION', 'false') - monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is false - assert data['disagg'] is False - # Check that num_prefill_gpu and num_decode_gpu are NOT present - assert 'num_prefill_gpu' not in data - assert 'num_decode_gpu' not in data - - -def test_disagg_false_when_decode_gpus_not_set(tmp_path, monkeypatch): - """Test that disagg=false when DECODE_GPUS is not set.""" - # Set environment variables (without DECODE_GPUS) - monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('PREFILL_GPUS', '4') - monkeypatch.setenv('DP_ATTENTION', 'false') - monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is false - assert data['disagg'] is False - # Check that num_prefill_gpu and num_decode_gpu are NOT present - assert 'num_prefill_gpu' not in data - assert 'num_decode_gpu' not in data - - -def test_disagg_false_when_both_env_vars_empty_strings(tmp_path, monkeypatch): - """Test that disagg=false when both PREFILL_GPUS and DECODE_GPUS are empty strings.""" - # Set environment variables with empty strings - 
monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('PREFILL_GPUS', '') - monkeypatch.setenv('DECODE_GPUS', '') - monkeypatch.setenv('DP_ATTENTION', 'false') - monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is false - assert data['disagg'] is False - # Check that num_prefill_gpu and num_decode_gpu are NOT present - assert 'num_prefill_gpu' not in data - assert 'num_decode_gpu' not in data - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) + + +def run_script(tmp_path, env, benchmark_result, result_filename="benchmark_result"): + """Helper to run the process_result.py script.""" + result_file = tmp_path / f"{result_filename}.json" + result_file.write_text(json.dumps(benchmark_result)) + + env = env.copy() + env["RESULT_FILENAME"] = result_filename + + return subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env=env, + capture_output=True, + text=True, + ) + + +# ============================================================================= +# Test get_required_env_vars function +# ============================================================================= + +class TestGetRequiredEnvVars: + """Tests for get_required_env_vars function.""" + + def test_all_vars_present(self, monkeypatch): + """Should return dict when all vars present.""" + monkeypatch.setenv("TEST_VAR_1", "value1") + monkeypatch.setenv("TEST_VAR_2", "value2") + + import os + + def get_required_env_vars(required_vars): + env_values = {} + missing_env_vars = [] + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + if missing_env_vars: 
+ raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}") + return env_values + + result = get_required_env_vars(["TEST_VAR_1", "TEST_VAR_2"]) + assert result["TEST_VAR_1"] == "value1" + assert result["TEST_VAR_2"] == "value2" + + def test_missing_vars_raises_error(self, monkeypatch): + """Should raise EnvironmentError when vars missing.""" + import os + + def get_required_env_vars(required_vars): + env_values = {} + missing_env_vars = [] + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}") + return env_values + + monkeypatch.delenv("NONEXISTENT_VAR", raising=False) + + with pytest.raises(EnvironmentError) as exc_info: + get_required_env_vars(["NONEXISTENT_VAR"]) + assert "NONEXISTENT_VAR" in str(exc_info.value) + + +# ============================================================================= +# Test script execution via subprocess +# ============================================================================= + +class TestProcessResultScript: + """Tests for process_result.py script execution.""" + + def test_single_node_processing(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """Test single-node result processing.""" + result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Verify base fields + assert output_data["hw"] == "mi300x" + assert output_data["framework"] == "sglang" + assert output_data["precision"] == "fp8" + assert output_data["spec_decoding"] == "none" + assert output_data["model"] == "deepseek-ai/DeepSeek-R1-0528" + assert output_data["conc"] == 64 + assert output_data["isl"] == 1024 + assert output_data["osl"] == 1024 + 
assert output_data["disagg"] is False + + # Verify single-node specific fields + assert output_data["is_multinode"] is False + assert output_data["tp"] == 8 + assert output_data["ep"] == 1 + assert output_data["dp_attention"] == "false" + + # Verify throughput calculations (divided by tp=8) + assert output_data["tput_per_gpu"] == pytest.approx(15000.5 / 8) + assert output_data["output_tput_per_gpu"] == pytest.approx(12000.0 / 8) + assert output_data["input_tput_per_gpu"] == pytest.approx((15000.5 - 12000.0) / 8) + + # Verify latency conversions (ms to seconds) + assert output_data["ttft_p50"] == pytest.approx(0.1505) + assert output_data["ttft_p99"] == pytest.approx(0.2503) + assert output_data["e2e_latency_p50"] == pytest.approx(1.5) + assert output_data["e2e_latency_p99"] == pytest.approx(2.5) + + # Verify interactivity calculations (1000 / tpot_ms) + assert output_data["intvty_p50"] == pytest.approx(1000.0 / 25.0) + assert output_data["intvty_p99"] == pytest.approx(1000.0 / 45.0) + + # Verify output file created + output_file = tmp_path / "agg_benchmark_result.json" + assert output_file.exists() + + def test_multinode_processing(self, tmp_path, sample_benchmark_result, multinode_env_vars): + """Test multinode result processing.""" + result = run_script(tmp_path, multinode_env_vars, sample_benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Verify base fields + assert output_data["hw"] == "gb200" + assert output_data["framework"] == "dynamo-trt" + assert output_data["precision"] == "fp4" + assert output_data["disagg"] is True + + # Verify multinode specific fields + assert output_data["is_multinode"] is True + assert output_data["prefill_tp"] == 4 + assert output_data["prefill_ep"] == 4 + assert output_data["prefill_dp_attention"] == "true" + assert output_data["prefill_num_workers"] == 5 + assert output_data["decode_tp"] == 8 + assert output_data["decode_ep"] == 8 + assert 
output_data["decode_dp_attention"] == "true" + assert output_data["decode_num_workers"] == 1 + assert output_data["num_prefill_gpu"] == 20 + assert output_data["num_decode_gpu"] == 8 + + # Verify throughput calculations + total_gpus = 20 + 8 # prefill + decode + assert output_data["tput_per_gpu"] == pytest.approx(15000.5 / total_gpus) + assert output_data["output_tput_per_gpu"] == pytest.approx(12000.0 / 8) # decode gpus + assert output_data["input_tput_per_gpu"] == pytest.approx((15000.5 - 12000.0) / 20) # prefill gpus + + def test_missing_base_env_vars(self, tmp_path, sample_benchmark_result): + """Test that missing base env vars causes failure.""" + result_file = tmp_path / "benchmark_result.json" + result_file.write_text(json.dumps(sample_benchmark_result)) + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env={"PATH": "/usr/bin", "RESULT_FILENAME": "benchmark_result"}, + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + def test_missing_single_node_env_vars(self, tmp_path, sample_benchmark_result, base_env_vars): + """Test that missing single-node env vars causes failure.""" + # base_env_vars doesn't have TP, EP_SIZE, DP_ATTENTION + result = run_script(tmp_path, base_env_vars, sample_benchmark_result) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + def test_missing_multinode_env_vars(self, tmp_path, sample_benchmark_result, base_env_vars): + """Test that missing multinode env vars causes failure.""" + env = base_env_vars.copy() + env["IS_MULTINODE"] = "true" + env["DISAGG"] = "true" + # Missing multinode-specific vars + + result = run_script(tmp_path, env, sample_benchmark_result) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + def test_disagg_without_multinode_fails(self, tmp_path, sample_benchmark_result, 
single_node_env_vars): + """Test that disagg=true without multinode raises error.""" + env = single_node_env_vars.copy() + env["DISAGG"] = "true" # Disagg without multinode + + result = run_script(tmp_path, env, sample_benchmark_result) + + assert result.returncode != 0 + assert "Disaggregated mode requires multinode setup" in result.stderr + + def test_missing_result_file(self, tmp_path, single_node_env_vars): + """Test that missing result file causes failure.""" + env = single_node_env_vars.copy() + env["RESULT_FILENAME"] = "nonexistent" + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env=env, + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + + +# ============================================================================= +# Test latency and throughput calculations +# ============================================================================= + +class TestCalculations: + """Tests for throughput and latency calculations.""" + + def test_latency_ms_to_seconds_conversion(self, tmp_path, single_node_env_vars): + """Test that _ms fields are converted to seconds.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 8, + "total_token_throughput": 1000.0, + "output_throughput": 800.0, + "custom_metric_ms": 500.0, # Should become custom_metric = 0.5 + } + + result = run_script(tmp_path, single_node_env_vars, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["custom_metric"] == pytest.approx(0.5) + + def test_tpot_to_interactivity_conversion(self, tmp_path, single_node_env_vars): + """Test that tpot fields are converted to interactivity.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 8, + "total_token_throughput": 1000.0, + "output_throughput": 800.0, + "tpot_p50_ms": 20.0, # Should become intvty_p50 = 50 + "tpot_p99_ms": 50.0, # Should become intvty_p99 = 20 
+ } + + result = run_script(tmp_path, single_node_env_vars, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["intvty_p50"] == pytest.approx(50.0) + assert output_data["intvty_p99"] == pytest.approx(20.0) + + def test_throughput_per_gpu_single_node(self, tmp_path, single_node_env_vars): + """Test throughput per GPU calculation for single node.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 8, + "total_token_throughput": 8000.0, + "output_throughput": 6000.0, + } + + env = single_node_env_vars.copy() + env["TP"] = "4" + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["tput_per_gpu"] == pytest.approx(2000.0) # 8000 / 4 + assert output_data["output_tput_per_gpu"] == pytest.approx(1500.0) # 6000 / 4 + assert output_data["input_tput_per_gpu"] == pytest.approx(500.0) # (8000 - 6000) / 4 + + def test_throughput_per_gpu_multinode(self, tmp_path, multinode_env_vars): + """Test throughput per GPU calculation for multinode.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 64, + "total_token_throughput": 28000.0, # Will be divided by total GPUs + "output_throughput": 16000.0, # Will be divided by decode GPUs + } + + env = multinode_env_vars.copy() + env["PREFILL_GPUS"] = "20" + env["DECODE_GPUS"] = "8" + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["tput_per_gpu"] == pytest.approx(1000.0) # 28000 / 28 + assert output_data["output_tput_per_gpu"] == pytest.approx(2000.0) # 16000 / 8 + assert output_data["input_tput_per_gpu"] == pytest.approx(600.0) # (28000 - 16000) / 20 + + +# 
# =============================================================================
# Test output file generation
# =============================================================================

class TestOutputFile:
    """Tests for output file generation."""

    def test_output_file_created(self, tmp_path, sample_benchmark_result, single_node_env_vars):
        """The script must write an 'agg_'-prefixed JSON file mirroring stdout."""
        result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result)
        assert result.returncode == 0, f"Script failed: {result.stderr}"

        # Default result name is "benchmark_result", so the aggregate file
        # is expected at "agg_benchmark_result.json" in the working dir.
        aggregate_path = tmp_path / "agg_benchmark_result.json"
        assert aggregate_path.exists()

        # The persisted payload and the stdout payload must be identical JSON.
        with open(aggregate_path) as handle:
            persisted = json.load(handle)
        assert persisted == json.loads(result.stdout)

    def test_output_file_has_correct_prefix(self, tmp_path, sample_benchmark_result, single_node_env_vars):
        """A custom result name must still gain the 'agg_' prefix."""
        result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result, "my_custom_result")
        assert result.returncode == 0, f"Script failed: {result.stderr}"

        assert (tmp_path / "agg_my_custom_result.json").exists()


# =============================================================================
# Test edge cases
# =============================================================================

class TestEdgeCases:
    """Tests for edge cases and special scenarios."""

    def test_boolean_disagg_parsing_false(self, tmp_path, sample_benchmark_result, single_node_env_vars):
        """Every false-ish spelling of DISAGG must parse to boolean False."""
        for flag in ("false", "False", "FALSE"):
            env = {**single_node_env_vars, "DISAGG": flag}

            result = run_script(tmp_path, env, sample_benchmark_result)
            assert result.returncode == 0, f"Script failed for DISAGG={flag}: {result.stderr}"

            payload = json.loads(result.stdout)
            assert payload["disagg"] is False

    def test_boolean_disagg_parsing_true_requires_multinode(self, tmp_path, sample_benchmark_result, single_node_env_vars):
        """Any true-ish spelling of DISAGG must fail without multinode."""
        for flag in ("true", "True", "TRUE"):
            env = {**single_node_env_vars, "DISAGG": flag}
            assert run_script(tmp_path, env, sample_benchmark_result).returncode != 0

    def test_is_multinode_default_false(self, tmp_path, sample_benchmark_result, single_node_env_vars):
        """Omitting IS_MULTINODE must default is_multinode to False."""
        # IS_MULTINODE is deliberately left unset here.
        result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result)
        assert result.returncode == 0, f"Script failed: {result.stderr}"

        assert json.loads(result.stdout)["is_multinode"] is False

    def test_integer_conversion(self, tmp_path, single_node_env_vars):
        """ISL/OSL env strings must come back as genuine ints in the output."""
        benchmark_result = {
            "model_id": "test-model",
            "max_concurrency": 32,
            "total_token_throughput": 5000.0,
            "output_throughput": 4000.0,
        }

        env = {**single_node_env_vars, "ISL": "8192", "OSL": "1024"}

        result = run_script(tmp_path, env, benchmark_result)
        assert result.returncode == 0, f"Script failed: {result.stderr}"

        payload = json.loads(result.stdout)
        for key, expected in (("isl", 8192), ("osl", 1024)):
            assert payload[key] == expected
            assert isinstance(payload[key], int)

    def test_conc_from_benchmark_result(self, tmp_path, single_node_env_vars):
        """The 'conc' field must echo max_concurrency from the raw result."""
        benchmark_result = {
            "model_id": "test-model",
            "max_concurrency": 128,
            "total_token_throughput": 5000.0,
            "output_throughput": 4000.0,
        }

        result = run_script(tmp_path, single_node_env_vars, benchmark_result)
        assert result.returncode == 0, f"Script failed: {result.stderr}"

        assert json.loads(result.stdout)["conc"] == 128