From e605477c53bb9f8f9c989fdd2ee1058c98a3030e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 14:55:58 -0600 Subject: [PATCH 01/98] adding initial changes to master configs; adding initial updates to validation logic and config parser --- .github/configs/amd-master.yaml | 7 + .github/configs/nvidia-master.yaml | 472 ++++++++++ utils/matrix-logic/generate_sweep_configs.py | 873 ++++--------------- utils/matrix-logic/get_test_sweep_configs.py | 151 ---- utils/matrix-logic/validation.py | 290 ++++++ 5 files changed, 962 insertions(+), 831 deletions(-) delete mode 100644 utils/matrix-logic/get_test_sweep_configs.py create mode 100644 utils/matrix-logic/validation.py diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 938011d47..a2674153a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -5,6 +5,7 @@ dsr1-fp4-mi355x-sglang: runner: mi355x precision: fp4 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -27,6 +28,7 @@ dsr1-fp8-mi300x-sglang: runner: mi300x precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -48,6 +50,7 @@ dsr1-fp8-mi325x-sglang: runner: mi325x precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -69,6 +72,7 @@ dsr1-fp8-mi355x-sglang: runner: mi355x precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -90,6 +94,7 @@ gptoss-fp4-mi300x-vllm: runner: mi300x precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -120,6 +125,7 @@ gptoss-fp4-mi325x-vllm: runner: mi325x precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -150,6 +156,7 @@ gptoss-fp4-mi355x-vllm: runner: mi355x precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml 
index 954abbba2..f83228c85 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -5,6 +5,7 @@ dsr1-fp4-b200-sglang: runner: b200 precision: fp4 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -29,6 +30,7 @@ dsr1-fp4-b200-trt: runner: b200-trt precision: fp4 framework: trt + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -79,6 +81,7 @@ dsr1-fp8-b200-sglang: runner: b200 precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -100,6 +103,7 @@ dsr1-fp8-b200-trt: runner: b200-trt precision: fp8 framework: trt + multinode: false seq-len-configs: # For all sequence lengths, EP=TP - isl: 1024 @@ -126,6 +130,7 @@ dsr1-fp8-h200-sglang: runner: h200 precision: fp8 framework: sglang + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -147,6 +152,7 @@ dsr1-fp8-h200-trt: runner: h200 precision: fp8 framework: trt + multinode: false # For all sequence lengths, EP=TP seq-len-configs: - isl: 1024 @@ -173,6 +179,7 @@ gptoss-fp4-b200-trt: runner: b200-trt precision: fp4 framework: trt + multinode: false # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true seq-len-configs: - isl: 1024 @@ -204,6 +211,7 @@ gptoss-fp4-b200-vllm: runner: b200 precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -234,6 +242,7 @@ gptoss-fp4-h100-vllm: runner: h100 precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -261,6 +270,7 @@ gptoss-fp4-h200-trt: runner: h200 precision: fp4 framework: trt + multinode: false # For all sequence lengths, EP=TP, DP_ATTENTION=false seq-len-configs: - isl: 1024 @@ -292,6 +302,7 @@ gptoss-fp4-h200-vllm: runner: h200 precision: fp4 framework: vllm + multinode: false seq-len-configs: - isl: 1024 osl: 1024 @@ -314,3 +325,464 @@ gptoss-fp4-h200-vllm: - { tp: 2, conc-start: 4, conc-end: 64 } - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, 
conc-end: 32 } + +dsr1-fp4-gb200-trt: + image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 + model: deepseek-r1-fp4 + model-prefix: dsr1 + runner: gb200 + precision: fp4 + framework: dynamo-trtllm + multinode: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # NOTE: Prefill tp and ep are always 4 because each GB200 node has 4 GPUs and + # ctx_tp_size is hardcoded to 4 in launch_gb200-nv.sh. Decode tp/ep matches gen_tp_size. + # For 1k/1k: prefill batch-size=4, max-num-tokens=4608 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 16, 36 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "GEN_MAX_NUM_TOKENS=128" + - "GEN_MAX_BATCH_SIZE=32" + - "GEN_GPU_MEM_FRACTION=0.9" + - "GEN_MTP_SIZE=3" + - "GEN_EPLB_NUM_SLOTS=0" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 512, 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=256" + - "GEN_MAX_BATCH_SIZE=64" + - "GEN_GPU_MEM_FRACTION=0.7" + - "GEN_MTP_SIZE=3" + - "GEN_EPLB_NUM_SLOTS=0" + + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=256" + - "GEN_MAX_BATCH_SIZE=128" + - "GEN_GPU_MEM_FRACTION=0.7" + - "GEN_MTP_SIZE=1" + - "GEN_EPLB_NUM_SLOTS=0" + + - spec-decoding: "mtp" + conc-list: [ 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + 
dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=64" + - "GEN_MAX_BATCH_SIZE=16" + - "GEN_GPU_MEM_FRACTION=0.6" + - "GEN_MTP_SIZE=3" + - "GEN_EPLB_NUM_SLOTS=0" + + - spec-decoding: "mtp" + conc-list: [ 2252 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=512" + - "GEN_MAX_BATCH_SIZE=256" + - "GEN_GPU_MEM_FRACTION=0.8" + - "GEN_MTP_SIZE=1" + - "GEN_EPLB_NUM_SLOTS=0" + + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "GEN_MAX_NUM_TOKENS=128" + - "GEN_MAX_BATCH_SIZE=128" + - "GEN_GPU_MEM_FRACTION=0.9" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=32" + - "GEN_MAX_BATCH_SIZE=32" + - "GEN_GPU_MEM_FRACTION=0.7" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=64" + - "GEN_MAX_BATCH_SIZE=64" + - 
"GEN_GPU_MEM_FRACTION=0.75" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + - conc-list: [ 2048, 4300 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=256" + - "GEN_MAX_BATCH_SIZE=256" + - "GEN_GPU_MEM_FRACTION=0.75" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + - conc-list: [ 4300 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=4608" + - "CTX_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=512" + - "GEN_MAX_BATCH_SIZE=512" + - "GEN_GPU_MEM_FRACTION=0.8" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 18 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "GEN_MAX_NUM_TOKENS=64" + - "GEN_MAX_BATCH_SIZE=16" + - "GEN_GPU_MEM_FRACTION=0.9" + - "GEN_MTP_SIZE=3" + - "GEN_EPLB_NUM_SLOTS=0" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 128, 269 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=32" + - "GEN_MAX_BATCH_SIZE=8" + - "GEN_GPU_MEM_FRACTION=0.7" + - "GEN_MTP_SIZE=3" + - "GEN_EPLB_NUM_SLOTS=0" + + - spec-decoding: "mtp" + conc-list: [ 538 ] + 
prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=64" + - "GEN_MAX_BATCH_SIZE=16" + - "GEN_GPU_MEM_FRACTION=0.7" + - "GEN_MTP_SIZE=3" + - "GEN_EPLB_NUM_SLOTS=0" + + - spec-decoding: "mtp" + conc-list: [ 1075 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=256" + - "GEN_MAX_BATCH_SIZE=64" + - "GEN_GPU_MEM_FRACTION=0.75" + - "GEN_MTP_SIZE=2" + - "GEN_EPLB_NUM_SLOTS=0" + + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=512" + - "GEN_MAX_BATCH_SIZE=256" + - "GEN_GPU_MEM_FRACTION=0.8" + - "GEN_MTP_SIZE=1" + - "GEN_EPLB_NUM_SLOTS=0" + + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 34 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "GEN_MAX_NUM_TOKENS=32" + - "GEN_MAX_BATCH_SIZE=32" + - "GEN_GPU_MEM_FRACTION=0.9" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - conc-list: [ 256, 538 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: 
+ - "GEN_MAX_NUM_TOKENS=16" + - "GEN_MAX_BATCH_SIZE=16" + - "GEN_GPU_MEM_FRACTION=0.7" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + - conc-list: [ 1075 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=64" + - "GEN_MAX_BATCH_SIZE=64" + - "GEN_GPU_MEM_FRACTION=0.75" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + - conc-list: [ 2150 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=128" + - "GEN_MAX_BATCH_SIZE=128" + - "GEN_GPU_MEM_FRACTION=0.75" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" + + - conc-list: [ 2150 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "CTX_MAX_NUM_TOKENS=8448" + - "CTX_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "GEN_MAX_NUM_TOKENS=256" + - "GEN_MAX_BATCH_SIZE=256" + - "GEN_GPU_MEM_FRACTION=0.8" + - "GEN_MTP_SIZE=0" + - "GEN_EPLB_NUM_SLOTS=0" diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index f0d9a4390..37c567279 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -1,35 +1,8 @@ import json import yaml import argparse -from pydantic import BaseModel, Field, ValidationError, ConfigDict -from typing import List - -# Field name constants -# Top-level config fields -FIELD_IMAGE = 'image' -FIELD_MODEL = 'model' -FIELD_MODEL_PREFIX = 'model-prefix' -FIELD_PRECISION = 'precision' -FIELD_FRAMEWORK = 'framework' -FIELD_RUNNER = 'runner' -FIELD_SEQ_LEN_CONFIGS = 'seq-len-configs' - -# Seq-len-config fields -FIELD_ISL = 'isl' 
-FIELD_OSL = 'osl' -FIELD_SEARCH_SPACE = 'search-space' - -# Search-space/benchmark fields -FIELD_TP = 'tp' -FIELD_CONC_START = 'conc-start' -FIELD_CONC_END = 'conc-end' -FIELD_EP = 'ep' -FIELD_DP_ATTN = 'dp-attn' - -# Matrix entry fields -FIELD_CONC = 'conc' -FIELD_MAX_MODEL_LEN = 'max-model-len' -FIELD_EXP_NAME = 'exp-name' + +from validation import validate_master_config, validate_matrix_output, validate_runner_config, Fields seq_len_stoi = { "1k1k": (1024, 1024), @@ -50,126 +23,7 @@ def seq_len_to_str(isl: int, osl: int) -> str: return seq_len_itos.get((isl, osl), f"{isl}_{osl}") -class MatrixEntry(BaseModel): - """Pydantic model for validating matrix entry structure.""" - model_config = ConfigDict(extra='forbid', populate_by_name=True) - - image: str - model: str - precision: str - framework: str - runner: str - isl: int - osl: int - tp: int - ep: int - dp_attn: bool = Field(alias='dp-attn') - conc: int - max_model_len: int = Field(alias='max-model-len') - exp_name: str = Field(alias='exp-name') - - -def validate_matrix_output(matrix_values: List[dict]) -> List[dict]: - """Validate that matrix_values entries match the expected structure. - - Raises ValueError if any entry fails validation. - Returns the original list if all entries are valid. - """ - for i, entry in enumerate(matrix_values): - try: - MatrixEntry(**entry) - except ValidationError as e: - raise ValueError(f"Matrix entry at index {i} failed validation:\n{e}") - return matrix_values - - -def validate_master_configs_structure(all_config_data): - """Validate the structure of all master config entries. - - This validates that all required fields are present, have correct types, - and no extra fields exist. Should be called once after loading config files. 
- """ - for key, val in all_config_data.items(): - # Check for required top-level fields and their types - required_fields = { - FIELD_IMAGE: str, - FIELD_MODEL: str, - FIELD_MODEL_PREFIX: str, - FIELD_PRECISION: str, - FIELD_FRAMEWORK: str, - FIELD_RUNNER: str, - FIELD_SEQ_LEN_CONFIGS: list - } - - for field, expected_type in required_fields.items(): - if field not in val or val[field] is None: - raise ValueError( - f"Missing required field '{field}' for key '{key}'") - if not isinstance(val[field], expected_type): - raise ValueError( - f"Field '{field}' must be {expected_type.__name__} for key '{key}', got {type(val[field]).__name__}") - - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - if len(seq_len_configs) == 0: - raise ValueError( - f"'{FIELD_SEQ_LEN_CONFIGS}' must be a non-empty list for key '{key}'") - - # Validate each seq-len-config - for i, seq_config in enumerate(seq_len_configs): - # Check isl - if FIELD_ISL not in seq_config or seq_config[FIELD_ISL] is None: - raise ValueError( - f"Missing '{FIELD_ISL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_ISL], int): - raise ValueError( - f"'{FIELD_ISL}' must be int in seq-len-config[{i}] for key '{key}'") - - # Check osl - if FIELD_OSL not in seq_config or seq_config[FIELD_OSL] is None: - raise ValueError( - f"Missing '{FIELD_OSL}' in seq-len-config[{i}] for key '{key}'") - if not isinstance(seq_config[FIELD_OSL], int): - raise ValueError( - f"'{FIELD_OSL}' must be int in seq-len-config[{i}] for key '{key}'") - - bmk_space = seq_config.get(FIELD_SEARCH_SPACE) - if not bmk_space or not isinstance(bmk_space, list) or len(bmk_space) == 0: - raise ValueError( - f"Missing or invalid '{FIELD_SEARCH_SPACE}' in seq-len-config[{i}] for key '{key}'") - - # Validate each benchmark in search-space - for j, bmk in enumerate(bmk_space): - # Define allowed fields - allowed_fields = {FIELD_TP, FIELD_CONC_START, - FIELD_CONC_END, FIELD_EP, FIELD_DP_ATTN} - required_bmk_fields = {FIELD_TP: 
int, - FIELD_CONC_START: int, FIELD_CONC_END: int} - optional_bmk_fields = {FIELD_EP: int, FIELD_DP_ATTN: bool} - - # Check for extra fields - extra_fields = set(bmk.keys()) - allowed_fields - if extra_fields: - raise ValueError( - f"Extra fields {extra_fields} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate required fields - for field, expected_type in required_bmk_fields.items(): - if field not in bmk or bmk[field] is None: - raise ValueError( - f"Missing '{field}' in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - # Validate optional fields if they exist - for field, expected_type in optional_bmk_fields.items(): - if field in bmk and bmk[field] is not None: - if not isinstance(bmk[field], expected_type): - raise ValueError( - f"'{field}' must be {expected_type.__name__} in search-space[{j}] of seq-len-config[{i}] for key '{key}'") - - -def generate_full_sweep(args, all_config_data): +def generate_full_sweep(args, all_config_data, runner_data): """Generate full sweep configurations with optional filtering. Supports filtering by model prefix, precision, framework, runner type, and sequence lengths. 
@@ -181,18 +35,7 @@ def generate_full_sweep(args, all_config_data): """ # Validate runner types if specified if args.runner_type: - if not args.runner_config: - raise ValueError( - "--runner-config is required when --runner-type is specified") - - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - valid_runner_types = set(runner_config.keys()) + valid_runner_types = set(runner_data.keys()) invalid_runners = set(args.runner_type) - valid_runner_types if invalid_runners: raise ValueError( @@ -213,259 +56,183 @@ def generate_full_sweep(args, all_config_data): continue # Filter by precision if specified - if args.precision and val[FIELD_PRECISION] not in args.precision: + if args.precision and val[Fields.PRECISION.value] not in args.precision: continue # Filter by framework if specified - if args.framework and val[FIELD_FRAMEWORK] not in args.framework: + if args.framework and val[Fields.FRAMEWORK.value] not in args.framework: continue # Filter by runner type if specified - if args.runner_type and val[FIELD_RUNNER] not in args.runner_type: + if args.runner_type and val[Fields.RUNNER.value] not in args.runner_type: continue - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - image = val[FIELD_IMAGE] - model = val[FIELD_MODEL] - precision = val[FIELD_PRECISION] - framework = val[FIELD_FRAMEWORK] - runner = val[FIELD_RUNNER] - model_code = val[FIELD_MODEL_PREFIX] + # Check if this is a multinode config + is_multinode = val.get(Fields.MULTINODE.value, False) + + seq_len_configs = val[Fields.SEQ_LEN_CONFIGS.value] + image = val[Fields.IMAGE.value] + model = val[Fields.MODEL.value] + precision = val[Fields.PRECISION.value] + framework = val[Fields.FRAMEWORK.value] + runner = val[Fields.RUNNER.value] + model_code = val[Fields.MODEL_PREFIX.value] for seq_config in seq_len_configs: - isl = seq_config[FIELD_ISL] - osl = 
seq_config[FIELD_OSL] + isl = seq_config[Fields.ISL.value] + osl = seq_config[Fields.OSL.value] # Filter by sequence lengths if specified if seq_lens_filter and (isl, osl) not in seq_lens_filter: continue - bmk_space = seq_config[FIELD_SEARCH_SPACE] + bmk_space = seq_config[Fields.SEARCH_SPACE.value] if args.test_mode: + # In test mode, skip multinode configs for now + if is_multinode: + continue + # In test mode, use highest TP with lowest concurrency - highest_tp_bmk = max(bmk_space, key=lambda x: x[FIELD_TP]) - tp = highest_tp_bmk[FIELD_TP] - conc = highest_tp_bmk[FIELD_CONC_START] - ep = highest_tp_bmk.get(FIELD_EP) - dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) + highest_tp_bmk = max( + bmk_space, key=lambda x: x[Fields.TP.value]) + tp = highest_tp_bmk[Fields.TP.value] + conc = highest_tp_bmk[Fields.CONC_START.value] + ep = highest_tp_bmk.get(Fields.EP.value) + dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) seq_len_str = seq_len_to_str(isl, osl) entry = { - FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, - FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_EP: 1, # Default - FIELD_DP_ATTN: False, # Default - FIELD_CONC: conc, - FIELD_MAX_MODEL_LEN: isl + osl + 200, - FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.EP.value: 1, # Default + Fields.DP_ATTN.value: False, # Default + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", } if ep is not None: - entry[FIELD_EP] = ep + entry[Fields.EP.value] = ep if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn + entry[Fields.DP_ATTN.value] = dp_attn + validate_matrix_output(entry, is_multinode) 
matrix_values.append(entry) else: # Full sweep mode for bmk in bmk_space: - tp = bmk[FIELD_TP] - conc_start = bmk[FIELD_CONC_START] - conc_end = bmk[FIELD_CONC_END] - ep = bmk.get(FIELD_EP) - dp_attn = bmk.get(FIELD_DP_ATTN) - - conc = conc_start - while conc <= conc_end: + if is_multinode: + # Skip multinode configs when --single-node is specified + if not args.multi_node: + continue + + # Multinode configuration + spec_decoding = bmk.get(Fields.SPEC_DECODING.value) + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + + # Get concurrency values (can be list or range) + conc_list = bmk.get(Fields.CONC_LIST.value) + # If it's a list + if conc_list: + conc_values = conc_list + # If it's a range + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + # For multinode, create a single entry with conc as a list seq_len_str = seq_len_to_str(isl, osl) entry = { - FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, - FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_CONC: conc, - FIELD_MAX_MODEL_LEN: isl + osl + 200, - FIELD_EP: 1, # Default - FIELD_DP_ATTN: False, # Default - FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc_values, # Pass the entire list for multinode + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", } - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - 
entry[FIELD_DP_ATTN] = dp_attn + # Add spec_decoding if specified + if spec_decoding is not None: + entry[Fields.SPEC_DECODING.value] = spec_decoding + validate_matrix_output(entry, is_multinode) matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end + elif args.single_node: + # Single-node configuration + tp = bmk[Fields.TP.value] + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + + conc = conc_start + while conc <= conc_end: + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.TP.value: tp, + Fields.CONC.value: conc, + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EP.value: 1, # Default + Fields.DP_ATTN.value: False, # Default + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + } + + if ep is not None: + entry[Fields.EP.value] = ep + if dp_attn is not None: + entry[Fields.DP_ATTN.value] = dp_attn + + validate_matrix_output(entry, is_multinode) + matrix_values.append(entry) + + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end if len(matrix_values) == 0: - error_msg = "No configs found matching filters:" - if args.model_prefix: - error_msg += f" model-prefix={args.model_prefix}" - if args.precision: - error_msg += f" precision={args.precision}" - if args.framework: - error_msg += f" framework={args.framework}" - if args.runner_type: - error_msg += f" runner-type={args.runner_type}" - if seq_lens_filter: - error_msg += f" seq-lens={args.seq_lens}" - raise ValueError(error_msg) + raise ValueError("No configs found matching input filters.") return matrix_values -def generate_test_config(args, all_config_data): - 
"""Generate test configurations for a specific key. - - Assumes all_config_data has been validated by validate_config_structure(). - """ - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - val = all_config_data.get(args.key) - - if not val: - raise ValueError( - f"Specified key '{args.key}' does not exist in config files.") - - # Extract model code from config - model_code = val[FIELD_MODEL_PREFIX] - - runner_nodes = runner_config.get(val[FIELD_RUNNER], []) - if args.runner_node and args.runner_node not in runner_nodes: - raise ValueError( - f"Runner node '{args.runner_node}' is not compatible with config '{args.key}' which runs on runner type '{val[FIELD_RUNNER]}'. Available runner nodes for this config are '{', '.join(runner_nodes)}'.") - - seq_len_configs = val[FIELD_SEQ_LEN_CONFIGS] - image = val[FIELD_IMAGE] - model = val[FIELD_MODEL] - precision = val[FIELD_PRECISION] - framework = val[FIELD_FRAMEWORK] - # Use default runner or specific runner node if input by user - runner = val[FIELD_RUNNER] if not args.runner_node else args.runner_node - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config[FIELD_ISL] - osl = seq_config[FIELD_OSL] - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config[FIELD_SEARCH_SPACE] - - for bmk in bmk_space: - tp = bmk[FIELD_TP] - conc_start = bmk[FIELD_CONC_START] - conc_end = bmk[FIELD_CONC_END] - ep = bmk.get(FIELD_EP) - dp_attn = bmk.get(FIELD_DP_ATTN) - - # In test mode, only use the lowest concurrency (conc_start) - if args.test_mode: - entry = { - 
FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, - FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: conc_start, - FIELD_MAX_MODEL_LEN: isl + osl, - FIELD_EXP_NAME: f"{model_code}_test", - } - - # Add optional fields if they exist - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - else: - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - seq_len_str = seq_len_to_str(isl, osl) - entry = { - FIELD_IMAGE: image, - FIELD_MODEL: model, - FIELD_PRECISION: precision, - FIELD_FRAMEWORK: framework, - FIELD_RUNNER: runner, - FIELD_ISL: isl, - FIELD_OSL: osl, - FIELD_TP: tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: conc, - FIELD_MAX_MODEL_LEN: isl + osl, - FIELD_EXP_NAME: f"{model_code}_{seq_len_str}", - } - - # Add optional fields if they exist - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - return matrix_values - - -def generate_runner_model_sweep_config(args, all_config_data): +def generate_runner_model_sweep_config(args, all_config_data, runner_data): """Generate runner-model sweep configurations. Assumes all_config_data has been validated by validate_config_structure(). 
""" - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - runner_nodes = runner_config.get(args.runner_type) + runner_nodes = runner_data.get(args.runner_type) if not runner_nodes: raise ValueError( @@ -473,7 +240,8 @@ def generate_runner_model_sweep_config(args, all_config_data): # Filter runner nodes if filter is specified if args.runner_node_filter: - runner_nodes = [node for node in runner_nodes if args.runner_node_filter in node] + runner_nodes = [ + node for node in runner_nodes if args.runner_node_filter in node] if not runner_nodes: raise ValueError( f"No runner nodes found matching filter '{args.runner_node_filter}' for runner type '{args.runner_type}'.") @@ -481,189 +249,56 @@ def generate_runner_model_sweep_config(args, all_config_data): matrix_values = [] for key, val in all_config_data.items(): # Only consider configs with specified runner - if val[FIELD_RUNNER] != args.runner_type: - continue - - # Get model code for exp_name - model_code = val[FIELD_MODEL_PREFIX] - - # Find 1k1k config - target_config = None - for config in val[FIELD_SEQ_LEN_CONFIGS]: - if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: - target_config = config - break - - highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) - # Since we are just testing, pick the highest TP for this config and just test - # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk[FIELD_TP] - lowest_conc = highest_tp_bmk[FIELD_CONC_START] - - ep = highest_tp_bmk.get(FIELD_EP) - dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) - - for node in runner_nodes: - entry = { - FIELD_IMAGE: val[FIELD_IMAGE], - FIELD_MODEL: val[FIELD_MODEL], - FIELD_PRECISION: val[FIELD_PRECISION], - FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], - # Add one entry for each node under specified runner type - FIELD_RUNNER: node, - 
# Again, just use 1k1k since this is just meant to smoke test all runners - FIELD_ISL: 1024, - FIELD_OSL: 1024, - FIELD_TP: highest_tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: lowest_conc, - FIELD_MAX_MODEL_LEN: 2048, - FIELD_EXP_NAME: f"{model_code}_test", - } - - # Add optional fields if they exist - if ep is not None: - entry[FIELD_EP] = ep - if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn - - matrix_values.append(entry) - - return matrix_values - - -def generate_custom_test(args): - """Generate single 1k1k job for custom inputs. - """ - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - found_runner_label = False - for runner_type, runner_nodes in runner_config.items(): - if args.runner_label == runner_type or args.runner_label in runner_nodes: - found_runner_label = True - - if not found_runner_label: - raise ValueError(f"Unable to find specified runner label '{args.runner_label}'.") - - if not runner_nodes: - raise ValueError( - f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") - - return [ - { - FIELD_IMAGE: args.image, - FIELD_MODEL: args.model, - FIELD_PRECISION: args.precision, - FIELD_FRAMEWORK: args.framework, - FIELD_RUNNER: args.runner_label, - # Again, just use 1k1k since this is just meant to smoke test all runners - FIELD_ISL: 1024, - FIELD_OSL: 1024, - FIELD_TP: 8, - FIELD_EP: 1, - FIELD_DP_ATTN: False, - FIELD_CONC: 4, - FIELD_EXP_NAME: args.exp_name, - FIELD_MAX_MODEL_LEN: 2048, - } - ] - - -def generate_runner_sweep_config(args, all_config_data): - """Generate runner sweep configurations. - - Assumes all_config_data has been validated by validate_config_structure(). 
- """ - try: - with open(args.runner_config, 'r') as f: - runner_config = yaml.safe_load(f) - except FileNotFoundError as e: - raise ValueError( - f"Runner config file '{args.runner_config}' does not exist.") - - if not runner_config.get(args.runner_type): - raise ValueError( - f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") - - - matrix_values = [] - for key, val in all_config_data.items(): - # Only consider configs with specified runner - if not key.startswith(args.model_prefix): - continue - - if not val[FIELD_RUNNER] == args.runner_type: - continue - - # Optionally filter by precision and framework - if (args.precision and val[FIELD_PRECISION] != args.precision) or (args.framework and val[FIELD_FRAMEWORK] != args.framework): + if val[Fields.RUNNER.value] != args.runner_type: continue # Get model code for exp_name - model_code = val[FIELD_MODEL_PREFIX] - - runner_nodes = runner_config.get(val[FIELD_RUNNER]) - if not runner_nodes: - raise ValueError( - f"Runner '{val[FIELD_RUNNER]}' does not exist in runner config '{args.runner_config}'. 
Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + model_code = val[Fields.MODEL_PREFIX.value] # Find 1k1k config target_config = None - for config in val[FIELD_SEQ_LEN_CONFIGS]: - if config[FIELD_ISL] == 1024 and config[FIELD_OSL] == 1024: + for config in val[Fields.SEQ_LEN_CONFIGS.value]: + if config[Fields.ISL.value] == 1024 and config[Fields.OSL.value] == 1024: target_config = config break - highest_tp_bmk = max(target_config[FIELD_SEARCH_SPACE], key=lambda x: x[FIELD_TP]) + highest_tp_bmk = max( + target_config[Fields.SEARCH_SPACE.value], key=lambda x: x[Fields.TP.value]) # Since we are just testing, pick the highest TP for this config and just test # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk[FIELD_TP] - lowest_conc = highest_tp_bmk[FIELD_CONC_START] + highest_tp = highest_tp_bmk[Fields.TP.value] + lowest_conc = highest_tp_bmk[Fields.CONC_START.value] - ep = highest_tp_bmk.get(FIELD_EP) - dp_attn = highest_tp_bmk.get(FIELD_DP_ATTN) + ep = highest_tp_bmk.get(Fields.EP.value) + dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) for node in runner_nodes: entry = { - FIELD_IMAGE: val[FIELD_IMAGE], - FIELD_MODEL: val[FIELD_MODEL], - FIELD_PRECISION: val[FIELD_PRECISION], - FIELD_FRAMEWORK: val[FIELD_FRAMEWORK], + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], # Add one entry for each node under specified runner type - FIELD_RUNNER: node, + Fields.RUNNER.value: node, # Again, just use 1k1k since this is just meant to smoke test all runners - FIELD_ISL: 1024, - FIELD_OSL: 1024, - FIELD_TP: highest_tp, - FIELD_EP: 1, # Default, - FIELD_DP_ATTN: False, # Default - FIELD_CONC: lowest_conc, - FIELD_EXP_NAME: f"{model_code}_test", - FIELD_MAX_MODEL_LEN: 2048, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.TP.value: highest_tp, + 
Fields.EP.value: 1, # Default, + Fields.DP_ATTN.value: False, # Default + Fields.CONC.value: lowest_conc, + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", } # Add optional fields if they exist if ep is not None: - entry[FIELD_EP] = ep + entry[Fields.EP.value] = ep if dp_attn is not None: - entry[FIELD_DP_ATTN] = dp_attn + entry[Fields.DP_ATTN.value] = dp_attn matrix_values.append(entry) - if len(matrix_values) == 0: - error_msg = f"No configs found matching model prefix '{args.model_prefix}'" - if args.precision: - error_msg += f", precision '{args.precision}'" - if args.framework: - error_msg += f", framework '{args.framework}'" - raise ValueError(error_msg + ".") - return matrix_values @@ -693,6 +328,18 @@ def load_config_files(config_files): return all_config_data +def load_runner_file(runner_file): + """Load runner configuration file.""" + try: + with open(runner_file, 'r') as f: + runner_config = yaml.safe_load(f) + except FileNotFoundError as e: + raise ValueError( + f"Runner config file '{runner_file}' does not exist.") + + return runner_config + + def main(): # Create parent parser with common arguments parent_parser = argparse.ArgumentParser(add_help=False) @@ -702,6 +349,11 @@ def main(): required=True, help='One or more configuration files (YAML format)' ) + parent_parser.add_argument( + '--runner-config', + required=True, + help='Configuration file holding runner information (YAML format)' + ) # Create main parser parser = argparse.ArgumentParser( @@ -746,11 +398,6 @@ def main(): required=False, help='Runner type(s) to filter by (e.g., h200, h100) (optional, can specify multiple)' ) - full_sweep_parser.add_argument( - '--runner-config', - required=False, - help='Configuration file holding runner information (required if --runner-type is specified)' - ) full_sweep_parser.add_argument( '--seq-lens', nargs='+', @@ -769,53 +416,18 @@ def main(): action='store_true', help='Test mode: only run highest TP with lowest 
concurrency for each matching config' ) - full_sweep_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: test-config - test_config_parser = subparsers.add_parser( - 'test-config', - parents=[parent_parser], - add_help=False, - help='Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config.' - ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) - test_config_parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - test_config_parser.add_argument( - '--runner-node', - required=False, - help='Specific runner node to use' - ) - test_config_parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." 
- ) - test_config_parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' + node_type_group = full_sweep_parser.add_mutually_exclusive_group(required=True) + node_type_group.add_argument( + '--single-node', + action='store_true', + help='Only generate single-node configurations' ) - test_config_parser.add_argument( - '--test-mode', + node_type_group.add_argument( + '--multi-node', action='store_true', - help='Generate only the lowest concurrency value for each TP level' + help='Only generate multi-node configurations' ) - test_config_parser.add_argument( + full_sweep_parser.add_argument( '-h', '--help', action='help', help='Show this help message and exit' @@ -833,11 +445,6 @@ def main(): required=True, help='Runner type (e.g., b200-trt, h100)' ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) test_config_parser.add_argument( '--runner-node-filter', required=False, @@ -849,117 +456,23 @@ def main(): help='Show this help message and exit' ) - # Subcommand: runner-sweep - test_config_parser = subparsers.add_parser( - 'runner-sweep', - parents=[parent_parser], - add_help=False, - help='Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runners nodes that should run gptoss-120b actually do so successfully.' 
- ) - test_config_parser.add_argument( - '--runner-type', - required=True, - help='Runner type (e.g., b200-trt, h100)' - ) - test_config_parser.add_argument( - '--model-prefix', - required=True, - help='Model prefix (e.g., 70b)' - ) - test_config_parser.add_argument( - '--precision', - required=False, - help='Precision to filter by (e.g., fp4) (optional)' - ) - test_config_parser.add_argument( - '--framework', - required=False, - help='Framework to filter by (e.g., trt) (optional)' - ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) - test_config_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - - # Subcommand: custom - test_config_parser = subparsers.add_parser( - 'custom', - parents=[parent_parser], - add_help=False, - help='Enter custom values' - ) - test_config_parser.add_argument( - '--runner-label', - required=True, - help='Label associated with runner on which to launch the corresponding job (e.g., h200, h200-nv_1, etc.)' - ) - test_config_parser.add_argument( - '--image', - required=True, - help='Image to run the benchmark (e.g., openai/gpt-oss-120b)' - ) - test_config_parser.add_argument( - '--model', - required=True, - help='Model to run (e.g., vllm/vllm-openai:latest)' - ) - test_config_parser.add_argument( - '--framework', - required=True, - help='Framework to run on (e.g., vllm, trt, sglang)' - ) - test_config_parser.add_argument( - '--precision', - required=True, - help='Precision to run (e.g., fp4, fp8)' - ) - test_config_parser.add_argument( - '--exp-name', - required=True, - help='Experiment name (e.g., 70b_test)' - ) - test_config_parser.add_argument( - '--runner-config', - required=True, - help='Configuration file holding runner information' - ) - test_config_parser.add_argument( - '-h', '--help', - action='help', - help='Show this help message and exit' - ) - args = parser.parse_args() # Load and validate 
configuration files all_config_data = load_config_files(args.config_files) - validate_master_configs_structure(all_config_data) + runner_data = load_runner_file(args.runner_config) + validate_master_config(all_config_data) + validate_runner_config(runner_data) # Route to appropriate function based on subcommand if args.command == 'full-sweep': - matrix_values = generate_full_sweep(args, all_config_data) - elif args.command == 'test-config': - matrix_values = generate_test_config(args, all_config_data) + matrix_values = generate_full_sweep(args, all_config_data, runner_data) elif args.command == 'runner-model-sweep': matrix_values = generate_runner_model_sweep_config( - args, all_config_data) - elif args.command == 'runner-sweep': - matrix_values = generate_runner_sweep_config( - args, all_config_data) - elif args.command == 'custom': - matrix_values = generate_custom_test(args) + args, all_config_data, runner_data) else: parser.error(f"Unknown command: {args.command}") - # Validate output before printing - validate_matrix_output(matrix_values) - print(json.dumps(matrix_values)) return matrix_values diff --git a/utils/matrix-logic/get_test_sweep_configs.py b/utils/matrix-logic/get_test_sweep_configs.py deleted file mode 100644 index 4df4a51eb..000000000 --- a/utils/matrix-logic/get_test_sweep_configs.py +++ /dev/null @@ -1,151 +0,0 @@ -import json -import yaml -import sys -import argparse - -seq_len_stoi = { - "1k1k": (1024, 1024), - "1k8k": (1024, 8192), - "8k1k": (8192, 1024) -} - -def main(): - parser = argparse.ArgumentParser( - description='Generate benchmark matrix from a specific configuration key' - ) - parser.add_argument( - '--config-files', - nargs='+', - required=True, - help='One or more configuration files (YAML format)' - ) - parser.add_argument( - '--key', - required=True, - help='Configuration key to use' - ) - parser.add_argument( - '--seq-lens', - nargs='+', - choices=list(seq_len_stoi.keys()), - required=False, - help=f"Sequence length 
configurations to include: {', '.join(seq_len_stoi.keys())}. If not specified, all sequence lengths are included." - ) - parser.add_argument( - '--step-size', - type=int, - default=2, - help='Step size for concurrency values (default: 2)' - ) - - args = parser.parse_args() - - # Convert seq-lens to set of (isl, osl) tuples for filtering - seq_lens_filter = None - if args.seq_lens: - seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} - - # Load and merge all config files - all_config_data = {} - for config_file in args.config_files: - try: - with open(config_file, 'r') as f: - config_data = yaml.safe_load(f) - assert isinstance(config_data, dict), f"Config file '{config_file}' must contain a dictionary" - - # Check for duplicate keys - duplicate_keys = set(all_config_data.keys()) & set(config_data.keys()) - if duplicate_keys: - raise ValueError( - f"Duplicate configuration keys found in '{config_file}': {', '.join(sorted(duplicate_keys))}" - ) - - all_config_data.update(config_data) - except FileNotFoundError: - raise ValueError(f"Input file '{config_file}' does not exist.") - - # Check if the key exists - if args.key not in all_config_data: - available_keys = ', '.join(sorted(all_config_data.keys())) - raise ValueError( - f"Key '{args.key}' not found in configuration files. 
" - f"Available keys: {available_keys}" - ) - - val = all_config_data[args.key] - - # Validate required fields - seq_len_configs = val.get('seq-len-configs') - assert seq_len_configs, f"Missing 'seq-len-configs' for key '{args.key}'" - - image = val.get('image') - model = val.get('model') - precision = val.get('precision') - framework = val.get('framework') - runner = val.get('runner') - - assert None not in (image, model, precision, framework, runner), \ - f"Missing required fields (image, model, precision, framework, runner) for key '{args.key}'" - - matrix_values = [] - - # Process each sequence length configuration - for seq_config in seq_len_configs: - isl = seq_config.get('isl') - osl = seq_config.get('osl') - - assert None not in (isl, osl), \ - f"Missing 'isl' or 'osl' in seq-len-config for key '{args.key}'" - - # Filter by sequence lengths if specified - if seq_lens_filter and (isl, osl) not in seq_lens_filter: - continue - - bmk_space = seq_config.get('bmk-space') - assert bmk_space, f"Missing 'bmk-space' in seq-len-config for key '{args.key}'" - - for bmk in bmk_space: - tp = bmk.get('tp') - conc_start = bmk.get('conc-start') - conc_end = bmk.get('conc-end') - ep = bmk.get('ep') - dp_attn = bmk.get('dp-attn') - - assert None not in (tp, conc_start, conc_end), \ - f"Missing 'tp', 'conc-start', or 'conc-end' in bmk-space for key '{args.key}'" - - # Generate entries for each concurrency value in the range - conc = conc_start - while conc <= conc_end: - entry = { - 'image': image, - 'model': model, - 'precision': precision, - 'framework': framework, - 'runner': runner, - 'isl': isl, - 'osl': osl, - 'tp': tp, - 'conc': conc, - 'max-model-len': isl + osl, - } - - # Add optional fields if they exist - if ep is not None: - entry['ep'] = ep - if dp_attn is not None: - entry['dp-attn'] = dp_attn - - matrix_values.append(entry) - - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - print(json.dumps(matrix_values)) - 
return matrix_values - -if __name__ == "__main__": - main() diff --git a/utils/matrix-logic/validation.py b/utils/matrix-logic/validation.py new file mode 100644 index 000000000..9fca9bfa9 --- /dev/null +++ b/utils/matrix-logic/validation.py @@ -0,0 +1,290 @@ +from pydantic import BaseModel, Field, ValidationError, ConfigDict, model_validator +from typing import List, Optional, Union, Literal +from enum import Enum + +import pprint + + +class Fields(Enum): + # Field name constants + # Top-level config fields + IMAGE = 'image' + MODEL = 'model' + MODEL_PREFIX = 'model-prefix' + PRECISION = 'precision' + FRAMEWORK = 'framework' + RUNNER = 'runner' + SEQ_LEN_CONFIGS = 'seq-len-configs' + MULTINODE = 'multinode' + + # Seq-len-config fields + ISL = 'isl' + OSL = 'osl' + SEARCH_SPACE = 'search-space' + + # Search-space/benchmark fields + TP = 'tp' + CONC_START = 'conc-start' + CONC_END = 'conc-end' + CONC_LIST = 'conc-list' + EP = 'ep' + DP_ATTN = 'dp-attn' + + # Multinode-specific fields (when MULTINODE = true) + SPEC_DECODING = 'spec-decoding' + PREFILL = 'prefill' + DECODE = 'decode' + NUM_WORKER = 'num-worker' + BATCH_SIZE = 'batch-size' + MAX_NUM_TOKENS = 'max-num-tokens' + ADDITIONAL_SETTINGS = 'additional-settings' + + # Matrix entry fields + CONC = 'conc' + MAX_MODEL_LEN = 'max-model-len' + EXP_NAME = 'exp-name' + + +class SingleNodeMatrixEntry(BaseModel): + """Pydantic model for validating single node matrix entry structure. 
+ This validates the input that should be expected to .github/workflows/benchmark-tmpl.yml""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + precision: str + framework: str + spec_decoding: Optional[Literal["mtp", "draft_model"]] = Field( + default=None, + alias=Fields.SPEC_DECODING.value + ) + runner: str + isl: int + osl: int + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + conc: Union[int, List[int]] + max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +class WorkerConfig(BaseModel): + """Pydantic model for validating worker configuration in multinode entries.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + num_worker: int = Field(alias=Fields.NUM_WORKER.value) + tp: int + ep: int + dp_attn: bool = Field(alias=Fields.DP_ATTN.value) + additional_settings: Optional[List[str]] = Field( + default=None, alias=Fields.ADDITIONAL_SETTINGS.value) + + +class MultiNodeMatrixEntry(BaseModel): + """Pydantic model for validating multinode matrix entry structure. + This validates the input that should be expected to .github/workflows/benchmark-multinode-tmpl.yml""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + precision: str + framework: str + spec_decoding: Optional[Literal["mtp", "draft_model"]] = Field( + default=None, + alias=Fields.SPEC_DECODING.value + ) + runner: str + isl: int + osl: int + prefill: WorkerConfig + decode: WorkerConfig + conc: List[int] + max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) + exp_name: str = Field(alias=Fields.EXP_NAME.value) + + +def validate_matrix_output(entry: dict, is_multinode: bool) -> dict: + """Validate that matrix_values entries match the expected structure. + + Raises ValueError if any entry fails validation. + Returns the original list if all entries are valid. 
+ """ + try: + if is_multinode: + MultiNodeMatrixEntry(**entry) + else: + SingleNodeMatrixEntry(**entry) + except ValidationError as e: + raise ValueError( + f"The following parsed matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}") + return entry + +# Input Master Config Validation + + +def _validate_conc_fields(self): + """Ensure either (conc_start AND conc_end) OR conc_list is provided, but not both.""" + has_range = self.conc_start is not None and self.conc_end is not None + has_list = self.conc_list is not None and len(self.conc_list) > 0 + + if has_range and has_list: + raise ValueError( + f"Cannot specify both '{Fields.CONC_LIST.value}' list and " + f"'{Fields.CONC_START.value}'/'{Fields.CONC_END.value}'. " + "Use either a list or a range, not both." + ) + + if not has_range and not has_list: + raise ValueError( + f"Must specify either '{Fields.CONC_LIST.value}' list or both " + f"'{Fields.CONC_START.value}' and '{Fields.CONC_END.value}'." + ) + + if has_range: + if self.conc_start is None or self.conc_end is None: + raise ValueError( + f"Both '{Fields.CONC_START.value}' and '{Fields.CONC_END.value}' " + "must be provided together." + ) + + if self.conc_start > self.conc_end: + raise ValueError( + f"'{Fields.CONC_START.value}' ({self.conc_start}) must be <= " + f"'{Fields.CONC_END.value}' ({self.conc_end})." + ) + + if has_list: + if not all(x > 0 for x in self.conc_list): + raise ValueError( + f"Input '{Fields.CONC_LIST.value}' entries must be greater than 0." 
+ ) + + return self + + +class SingleNodeSearchSpaceEntry(BaseModel): + """Single node search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + tp: int + ep: Optional[int] = None + spec_decoding: Optional[Literal["mtp", "draft_model"] + ] = Field(default=None, alias=Fields.SPEC_DECODING.value) + dp_attn: Optional[bool] = Field( + default=None, alias=Fields.DP_ATTN.value) + conc_start: Optional[int] = Field( + default=None, alias=Fields.CONC_START.value) + conc_end: Optional[int] = Field( + default=None, alias=Fields.CONC_END.value) + conc_list: Optional[List[int]] = Field( + default=None, alias=Fields.CONC_LIST.value) + + @model_validator(mode='after') + def validate_conc_fields(self): + return _validate_conc_fields(self) + + +class MultiNodeSearchSpaceEntry(BaseModel): + """Multinode search space configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + spec_decoding: Optional[Literal["mtp", "draft_model"] + ] = Field(default=None, alias=Fields.SPEC_DECODING.value) + prefill: WorkerConfig + decode: WorkerConfig + conc_start: Optional[int] = Field( + default=None, alias=Fields.CONC_START.value) + conc_end: Optional[int] = Field( + default=None, alias=Fields.CONC_END.value) + conc_list: Optional[List[int]] = Field( + default=None, alias=Fields.CONC_LIST.value) + + @model_validator(mode='after') + def validate_conc_fields(self): + return _validate_conc_fields(self) + + +class SingleNodeSeqLenConfig(BaseModel): + """Single node sequence length configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + isl: int + osl: int + search_space: List[SingleNodeSearchSpaceEntry] = Field( + alias=Fields.SEARCH_SPACE.value) + + +class MultiNodeSeqLenConfig(BaseModel): + """Multinode sequence length configuration.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + isl: int + osl: int + search_space: List[MultiNodeSearchSpaceEntry] = Field( + 
alias=Fields.SEARCH_SPACE.value) + + +class SingleNodeMasterConfigEntry(BaseModel): + """Top-level single node master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + multinode: Literal[False] + seq_len_configs: List[SingleNodeSeqLenConfig] = Field( + alias=Fields.SEQ_LEN_CONFIGS.value) + + +class MultiNodeMasterConfigEntry(BaseModel): + """Top-level multinode master configuration entry.""" + model_config = ConfigDict(extra='forbid', populate_by_name=True) + + image: str + model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) + precision: str + framework: str + runner: str + multinode: Literal[True] + seq_len_configs: List[MultiNodeSeqLenConfig] = Field( + alias=Fields.SEQ_LEN_CONFIGS.value) + + +def validate_master_config(master_configs: dict) -> List[dict]: + """Validate input master configuration structure.""" + for key, entry in master_configs.items(): + is_multinode = entry.get('multinode', False) + + try: + if is_multinode: + MultiNodeMasterConfigEntry(**entry) + else: + SingleNodeMasterConfigEntry(**entry) + except ValidationError as e: + raise ValueError( + f"Master config entry '{key}' failed validation:\n{e}") + return master_configs + +# Runner Config Validation + +def validate_runner_config(runner_configs: dict) -> List[dict]: + """Validate input master configuration structure.""" + for key, value in runner_configs.items(): + if not isinstance(value, list): + raise ValueError( + f"Runner config entry '{key}' must be a list, got {type(value).__name__}") + + if not all(isinstance(item, str) for item in value): + raise ValueError( + f"Runner config entry '{key}' must contain only strings") + + if not value: + raise ValueError( + f"Runner config entry '{key}' cannot be an empty list") + + return runner_configs From 039cf15888bd9005edd7ecdcd81401329f768334 Mon 
Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 15:43:10 -0600 Subject: [PATCH 02/98] adding new gb200 script --- runners/launch_gb200-nv-copy.sh | 286 ++++++++++++++++++++++++++++++++ 1 file changed, 286 insertions(+) create mode 100755 runners/launch_gb200-nv-copy.sh diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh new file mode 100755 index 000000000..bf41e1e12 --- /dev/null +++ b/runners/launch_gb200-nv-copy.sh @@ -0,0 +1,286 @@ +#!/usr/bin/bash + +# This script sets up the environment and launches multi-node benchmarks + + +# Set up environment variables for SLURM +export SLURM_PARTITION="batch" +export SLURM_ACCOUNT="benchmark" +export SLURM_JOB_NAME="benchmark-dynamo.job" + +### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars +if [[ $FRAMEWORK == "dynamo-sglang" ]]; then + export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" + export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" +else + SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + + # Update the IMAGE variable to the squash file + export IMAGE=$SQUASH_FILE + + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" + export SERVED_MODEL_NAME="deepseek-r1-fp4" +fi + + +export ISL="$ISL" +export OSL="$OSL" + +### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs +if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then + + # Set up Dynamo repository path + DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" + PERFORMANCE_SWEEPS_PATH="$DYNAMO_PATH/components/backends/trtllm/performance_sweeps" + + # Overview: + # The Dynamo repository contains the bench_serving repository as a submodule. 
+ # The submit_disagg.sh script, located at $PERFORMANCE_SWEEPS_PATH, orchestrates the entire benchmarking workflow: + # 1. Launches the Dynamo inference service with the specified configuration. + # 2. Waits for the service to become healthy. + # 3. Initiates benchmarking using the bench_serving tools. + # 4. Monitors all jobs until completion. + # 5. Collects and processes the results. + + # Always clone and setup Dynamo + echo "Cloning Dynamo repository..." + rm -rf "$DYNAMO_PATH" + git clone https://github.com/ai-dynamo/dynamo.git "$DYNAMO_PATH" + cd "$DYNAMO_PATH" + git checkout release/0.5.1-rc0.20251105 + git submodule update --init --recursive + + # Navigate to performance sweeps directory + cd "$PERFORMANCE_SWEEPS_PATH" + + # 1. CACHE_TRANSCEIVER_MAX_NUM_TOKENS controls the max_tokens_in_buffer value + # in cache_transceiver_config of TensorRT-LLM context and generation workers. + # Specifically, it is the max number of tokens the transfer buffer can fit. + + # Set up environment variables based on ISL/OSL + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 + elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then + export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 + else + echo "Unsupported ISL/OSL combination: $ISL/$OSL" + exit 1 + fi + + # Generate benchmark configurations based on ISL/OSL and MTP mode + generate_benchmark_configs() { + local isl="$1" + local osl="$2" + local mtp_mode="$3" + + # Usage: + # ./submit_disagg.sh [ctx_num] [gen_num] [gen_tp_size] [gen_batch_size] [gen_max_num_tokens] [gen_gpu_memory_fraction] [gen_eplb_num_slots] [gen_mtp_size] [gen_concurrency_list]" + # MTP Modes: + # mtp=off - Run without Multi-Token Prediction (gen_mtp_size=0) + # mtp=on - Run with Multi-Token Prediction (gen_mtp_size=1,2,3) + # Execution Modes: + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # dep - Run Data-Expert Parallel mode (attention_dp=true) + # Parameters for tep/dep modes: + # ctx_num: 
Number of context nodes + # gen_num: Number of generation nodes + # gen_tp_size: Generation tensor parallel size + # gen_batch_size: Generation batch size + # gen_max_num_tokens: Generation max number of tokens + # gen_gpu_memory_fraction: GPU memory fraction (0.7-0.95) + # gen_mtp_size: Multi-Token Prediction size (0 for mtp=off, 1-3 for mtp=on) + # gen_eplb_num_slots: Expert load balancing slots (0, 256, 288) + # gen_concurrency_list: Concurrency values (space-separated, quoted) + + if [ "$isl" = "1024" ] && [ "$osl" = "1024" ]; then + if [ "$mtp_mode" = "on" ]; then + echo "Running 1k/1k MTP=ON configurations" + + ./submit_disagg.sh "mtp=on" "tep" 1 4 8 32 128 "0.9" 3 0 "1 2 4 8 16 36" + + ./submit_disagg.sh "mtp=on" "dep" 1 1 16 64 256 "0.7" 3 0 "512 1075" + + ./submit_disagg.sh "mtp=on" "dep" 2 1 16 128 256 "0.7" 1 0 "2150" + + ./submit_disagg.sh "mtp=on" "dep" 1 1 32 16 64 "0.6" 3 0 "512" + + ./submit_disagg.sh "mtp=on" "dep" 1 1 8 256 512 "0.8" 1 0 "2252" + else + echo "Running 1k/1k MTP=OFF configurations" + + ./submit_disagg.sh "mtp=off" "tep" 1 4 8 128 128 "0.9" 0 0 "1 2 4 8 16 32 64 141" + + ./submit_disagg.sh "mtp=off" "dep" 1 1 32 32 32 "0.7" 0 0 "1075" + + ./submit_disagg.sh "mtp=off" "dep" 1 1 16 64 64 "0.75" 0 0 "1075" + + ./submit_disagg.sh "mtp=off" "dep" 2 1 16 256 256 "0.75" 0 0 "2048 4300" + + ./submit_disagg.sh "mtp=off" "dep" 1 1 8 512 512 "0.8" 0 0 "4300" + fi + elif [ "$isl" = "8192" ] && [ "$osl" = "1024" ]; then + if [ "$mtp_mode" = "on" ]; then + echo "Running 8k/1k MTP=ON configurations" + + ./submit_disagg.sh "mtp=on" "tep" 1 3 8 16 64 "0.9" 3 0 "1 2 4 8 18" + + ./submit_disagg.sh "mtp=on" "dep" 5 1 32 8 32 "0.7" 3 0 "128 269" + + ./submit_disagg.sh "mtp=on" "dep" 8 1 32 16 64 "0.7" 3 0 "538" + + ./submit_disagg.sh "mtp=on" "dep" 8 1 16 64 256 "0.75" 2 0 "1075" + + ./submit_disagg.sh "mtp=on" "dep" 6 1 8 256 512 "0.8" 1 0 "2150" + else + echo "Running 8k/1k MTP=OFF configurations" + + ./submit_disagg.sh "mtp=off" "tep" 1 3 8 32 32 
"0.9" 0 0 "1 2 4 8 16 34" + + ./submit_disagg.sh "mtp=off" "dep" 4 1 32 16 16 "0.7" 0 0 "256 538" + + ./submit_disagg.sh "mtp=off" "dep" 6 1 16 64 64 "0.75" 0 0 "1075" + + ./submit_disagg.sh "mtp=off" "dep" 8 1 16 128 128 "0.75" 0 0 "2150" + + ./submit_disagg.sh "mtp=off" "dep" 5 1 8 256 256 "0.8" 0 0 "2150" + fi + else + echo "Unsupported ISL/OSL combination: $isl/$osl" + exit 1 + fi + } + + # Run all benchmark configurations + generate_benchmark_configs "$ISL" "$OSL" "$MTP_MODE" + +else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" + # Set up Dynamo repository path + DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" + SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/components/backends/sglang/slurm_jobs" + + # Always clone and setup Dynamo + echo "Cloning Dynamo repository..." + rm -rf "$DYNAMO_PATH" + git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git $DYNAMO_PATH + cd "$DYNAMO_PATH" + + # Navigate to corresponding directory + cd "$SGL_SLURM_JOBS_PATH" + + # Set up SGL launch script-specific environment variables + export SLURM_ACCOUNT=$SLURM_ACCOUNT + export SLURM_PARTITION=$SLURM_PARTITION + export TIME_LIMIT="04:00:00" + export MODEL_PATH=$MODEL_PATH + export CONFIG_DIR=$CONFIG_DIR + export CONTAINER_IMAGE=$IMAGE + + # Launch jobs based on ISL/OSL + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + concurrency_list="1024x2048x4096x4608x4864x4992x5120x5376x5632x6144x8192" + bash ./submit_disagg.sh 6 3 12 1 8 $ISL $OSL $concurrency_list inf + elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then + concurrency_list="128x256x384x448x512x576x1024x2048x4096" + bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL $concurrency_list inf + else + echo "Unsupported ISL/OSL combination: $ISL/$OSL" + exit 1 + fi +fi + +# Wait for all jobs to complete +echo "Waiting for all jobs to complete..." +while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do + echo "Jobs still running..." 
+ squeue -u $USER + sleep 60 +done +echo "All jobs completed" + +### FRAMEWORK_DIFF_IF_STATEMENT #3 - difference in log post-processing +if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then + + # Find the logs directory (should be only one for this ISL/OSL combination) + LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + + # Find all result subdirectories in this logs directory + RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "No result subdirectories found in $LOGS_DIR" + exit 1 + fi + + echo "Found result subdirectories:" + echo "$RESULT_SUBDIRS" + + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Process individual concurrency result files + RESULTS_SUBDIR="$result_subdir/results" + + if [ -d "$RESULTS_SUBDIR" ]; then + echo "Processing results from: $RESULTS_SUBDIR" + + # Find all concurrency result files with new format + CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") + + for result_file in $CONCURRENCY_FILES; do + if [ -f "$result_file" ]; then + # Extract concurrency and GPU count from filename + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') + gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') + echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" + + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" + cp "$result_file" 
"$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + else + echo "Results subdirectory not found: $RESULTS_SUBDIR" + fi + done + +else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement + # Find the latest log directory + # we do "tail -1" here since only the latest job will yield the result + LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls $LOGS_DIR + + # Result JSON are contained within the result directory + for result_file in $(find $LOGS_DIR -type f); do + # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json + file_name=$(basename $result_file) + if [ -f $result_file ]; then + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying them to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done +fi + +echo "All result files processed" From f0d851d5a79e4c9023676704a8657ff26db702e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:10:30 -0600 Subject: [PATCH 03/98] adding integration to gb200 runner script and workflow files --- .github/configs/nvidia-master.yaml | 280 +++++++++--------- .../workflows/benchmark-multinode-tmpl.yml | 60 +++- .../workflows/full-sweep-1k1k-scheduler.yml | 175 +++++------ runners/launch_gb200-nv-copy.sh | 184 ++++-------- 4 files changed, 328 insertions(+), 371 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f83228c85..abcbc7544 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -351,19 +351,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: false additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false additional-settings: - - "GEN_MAX_NUM_TOKENS=128" - - "GEN_MAX_BATCH_SIZE=32" - - "GEN_GPU_MEM_FRACTION=0.9" - - "GEN_MTP_SIZE=3" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=3" + - "DECODE_EPLB_NUM_SLOTS=0" # dep - Run Data-Expert Parallel mode (attention_dp=true) - spec-decoding: "mtp" @@ -374,19 +374,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=256" - - "GEN_MAX_BATCH_SIZE=64" - - "GEN_GPU_MEM_FRACTION=0.7" - - "GEN_MTP_SIZE=3" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.7" + - 
"DECODE_MTP_SIZE=3" + - "DECODE_EPLB_NUM_SLOTS=0" - spec-decoding: "mtp" conc-list: [ 2150 ] @@ -396,19 +396,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=256" - - "GEN_MAX_BATCH_SIZE=128" - - "GEN_GPU_MEM_FRACTION=0.7" - - "GEN_MTP_SIZE=1" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=1" + - "DECODE_EPLB_NUM_SLOTS=0" - spec-decoding: "mtp" conc-list: [ 512 ] @@ -418,19 +418,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=64" - - "GEN_MAX_BATCH_SIZE=16" - - "GEN_GPU_MEM_FRACTION=0.6" - - "GEN_MTP_SIZE=3" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.6" + - "DECODE_MTP_SIZE=3" + - "DECODE_EPLB_NUM_SLOTS=0" - spec-decoding: "mtp" conc-list: [ 2252 ] @@ -440,19 +440,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=512" - - "GEN_MAX_BATCH_SIZE=256" - - "GEN_GPU_MEM_FRACTION=0.8" - - "GEN_MTP_SIZE=1" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=1" + - "DECODE_EPLB_NUM_SLOTS=0" # Non-MTP configurations (default spec_decoding="none") # tep - Run Tensor-Expert Parallel mode (attention_dp=false) @@ -463,19 
+463,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: false additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false additional-settings: - - "GEN_MAX_NUM_TOKENS=128" - - "GEN_MAX_BATCH_SIZE=128" - - "GEN_GPU_MEM_FRACTION=0.9" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" # dep - Run Data-Expert Parallel mode (attention_dp=true) - conc-list: [ 1075 ] @@ -485,19 +485,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=32" - - "GEN_MAX_BATCH_SIZE=32" - - "GEN_GPU_MEM_FRACTION=0.7" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" - conc-list: [ 1075 ] prefill: @@ -506,19 +506,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=64" - - "GEN_MAX_BATCH_SIZE=64" - - "GEN_GPU_MEM_FRACTION=0.75" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" - conc-list: [ 2048, 4300 ] prefill: @@ -527,19 +527,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" 
+ - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=256" - - "GEN_MAX_BATCH_SIZE=256" - - "GEN_GPU_MEM_FRACTION=0.75" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" - conc-list: [ 4300 ] prefill: @@ -548,19 +548,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=4608" - - "CTX_MAX_BATCH_SIZE=4" + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=512" - - "GEN_MAX_BATCH_SIZE=512" - - "GEN_GPU_MEM_FRACTION=0.8" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=512" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" - isl: 8192 osl: 1024 @@ -576,19 +576,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: false additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false additional-settings: - - "GEN_MAX_NUM_TOKENS=64" - - "GEN_MAX_BATCH_SIZE=16" - - "GEN_GPU_MEM_FRACTION=0.9" - - "GEN_MTP_SIZE=3" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=3" + - "DECODE_EPLB_NUM_SLOTS=0" # dep - Run Data-Expert Parallel mode (attention_dp=true) - spec-decoding: "mtp" @@ -599,19 +599,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=32" - - "GEN_MAX_BATCH_SIZE=8" - - 
"GEN_GPU_MEM_FRACTION=0.7" - - "GEN_MTP_SIZE=3" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=8" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" + - "DECODE_EPLB_NUM_SLOTS=0" - spec-decoding: "mtp" conc-list: [ 538 ] @@ -621,19 +621,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=64" - - "GEN_MAX_BATCH_SIZE=16" - - "GEN_GPU_MEM_FRACTION=0.7" - - "GEN_MTP_SIZE=3" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" + - "DECODE_EPLB_NUM_SLOTS=0" - spec-decoding: "mtp" conc-list: [ 1075 ] @@ -643,19 +643,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=256" - - "GEN_MAX_BATCH_SIZE=64" - - "GEN_GPU_MEM_FRACTION=0.75" - - "GEN_MTP_SIZE=2" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=2" + - "DECODE_EPLB_NUM_SLOTS=0" - spec-decoding: "mtp" conc-list: [ 2150 ] @@ -665,19 +665,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=512" - - "GEN_MAX_BATCH_SIZE=256" - - "GEN_GPU_MEM_FRACTION=0.8" - - "GEN_MTP_SIZE=1" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - 
"DECODE_MTP_SIZE=1" + - "DECODE_EPLB_NUM_SLOTS=0" # Non-MTP configurations (default spec_decoding="none") # tep - Run Tensor-Expert Parallel mode (attention_dp=false) @@ -688,19 +688,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: false additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 3 tp: 8 ep: 8 dp-attn: false additional-settings: - - "GEN_MAX_NUM_TOKENS=32" - - "GEN_MAX_BATCH_SIZE=32" - - "GEN_GPU_MEM_FRACTION=0.9" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" # dep - Run Data-Expert Parallel mode (attention_dp=true) - conc-list: [ 256, 538 ] @@ -710,19 +710,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=16" - - "GEN_MAX_BATCH_SIZE=16" - - "GEN_GPU_MEM_FRACTION=0.7" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=16" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" - conc-list: [ 1075 ] prefill: @@ -731,19 +731,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=64" - - "GEN_MAX_BATCH_SIZE=64" - - "GEN_GPU_MEM_FRACTION=0.75" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" - conc-list: [ 2150 ] prefill: 
@@ -752,19 +752,19 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=128" - - "GEN_MAX_BATCH_SIZE=128" - - "GEN_GPU_MEM_FRACTION=0.75" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" - conc-list: [ 2150 ] prefill: @@ -773,16 +773,16 @@ dsr1-fp4-gb200-trt: ep: 4 dp-attn: true additional-settings: - - "CTX_MAX_NUM_TOKENS=8448" - - "CTX_MAX_BATCH_SIZE=1" + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" decode: num-worker: 1 tp: 8 ep: 8 dp-attn: true additional-settings: - - "GEN_MAX_NUM_TOKENS=256" - - "GEN_MAX_BATCH_SIZE=256" - - "GEN_GPU_MEM_FRACTION=0.8" - - "GEN_MTP_SIZE=0" - - "GEN_EPLB_NUM_SLOTS=0" + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=0" + - "DECODE_EPLB_NUM_SLOTS=0" diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 78adffad0..9742bd466 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -27,16 +27,54 @@ on: osl: required: true type: string + conc: + required: true + type: string + mtp-mode: + required: true + type: string + max-model-len: required: true type: string random-range-ratio: required: false type: string - default: '0.8' - mtp-mode: + default: "0.8" + + prefill-num-worker: required: true type: string + prefill-tp: + required: true + type: string + prefill-ep: + required: true + type: string + prefill-dp-attn: + required: true + type: string + prefill-additional-settings: + required: false + type: string + default: "[]" + + decode-num-worker: + 
required: true + type: string + decode-tp: + required: true + type: string + decode-ep: + required: true + type: string + decode-dp-attn: + required: true + type: string + decode-additional-settings: + required: false + type: string + default: "[]" env: EXP_NAME: ${{ inputs.exp-name }} @@ -47,8 +85,19 @@ env: OSL: ${{ inputs.osl }} MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} + CONC: ${{ inputs.conc }} MTP_MODE: ${{ inputs.mtp-mode }} + PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} + PREFILL_TP: ${{ inputs.prefill-tp }} + PREFILL_EP: ${{ inputs.prefill-ep }} + PREFILL_DP_ATTN: ${{ inputs.prefill-dp-attn }} + + DECODE_NUM_WORKERS: ${{ inputs.decode-num-worker }} + DECODE_TP: ${{ inputs.decode-tp }} + DECODE_EP: ${{ inputs.decode-ep }} + DECODE_DP_ATTN: ${{ inputs.decode-dp-attn }} + permissions: contents: read @@ -56,7 +105,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 480 - name: '${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} mtp-${{ inputs.mtp-mode }}' + name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} mtp-${{ inputs.mtp-mode }}" steps: - name: Resource cleanup @@ -78,7 +127,10 @@ jobs: RUNNER_NAME: ${{ runner.name }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_mtp-${{ env.MTP_MODE }}_${{ runner.name }} run: | - bash ./runners/launch_${RUNNER_NAME%%_*}.sh + # bash ./runners/launch_${RUNNER_NAME%%_*}.sh + set -x + export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} + bash ./runners/launch_gb200-nv-copy.sh # Check if at least one result file was created if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml 
index 297119430..9f75335b0 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -1,15 +1,13 @@ -name: "Full Sweep Scheduler - 1k1k" +name: "Debug Test" on: workflow_dispatch: - schedule: - - cron: "0 0 * * *" jobs: get-dsr1-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -17,34 +15,34 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - get-gptoss-configs: - runs-on: ubuntu-latest - outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} - steps: - - name: Checkout code - uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - - id: get-gptoss-configs - run: | - pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - - benchmark-dsr1: + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --seq-lens 1k1k --model-prefix dsr1) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo 
$CONFIG_JSON_MULTI_NODE + + # get-gptoss-configs: + # runs-on: ubuntu-latest + # outputs: + # multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} + # steps: + # - name: Checkout code + # uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + # - id: get-gptoss-configs + # run: | + # pip install pydantic + # CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --node-type multinode --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) + # echo "multi-node-search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + + benchmark-dsr1-multi-node: needs: get-dsr1-configs - uses: ./.github/workflows/benchmark-tmpl.yml + uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: dsr1 1k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }} secrets: inherit with: - exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 max-model-len: 2048 @@ -53,98 +51,65 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - benchmark-gptoss: - needs: get-gptoss-configs - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} - secrets: inherit - with: - exp-name: "gptoss_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: 
${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} + exp-name: "dsr1_1k1k" conc: ${{ matrix.config.conc }} - - # This is a workaround until we can integrate GB200 into master configs. - benchmark-gb200: - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep / - strategy: - fail-fast: false - matrix: - config: - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k1k - isl: 1024 - osl: 1024 - max-model-len: 2048 mtp-mode: ${{ matrix.config.mtp }} + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + + + # 
benchmark-gptoss-multi-node: + # needs: get-gptoss-configs + # uses: ./.github/workflows/benchmark-multinode-tmpl.yml + # name: gptoss 1k1k / + # strategy: + # fail-fast: false + # matrix: + # config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }} + # secrets: inherit + # with: + # exp-name: "gptoss_1k1k" + # isl: 1024 + # osl: 1024 + # max-model-len: 2048 + # runner: ${{ matrix.config.runner }} + # image: ${{ matrix.config.image }} + # model: ${{ matrix.config.model }} + # framework: ${{ matrix.config.framework }} + # precision: ${{ matrix.config.precision }} + # tp: ${{ matrix.config.tp }} + # ep: ${{ matrix.config.ep }} + # dp-attn: ${{ matrix.config.dp-attn }} + # conc: ${{ matrix.config.conc }} + collect-dsr1-results: - needs: [benchmark-dsr1, benchmark-gb200] + needs: benchmark-dsr1-multi-node if: ${{ always() }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "dsr1_1k1k" - collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k1k" + # collect-gptoss-results: + # needs: benchmark-gptoss-multi-node + # if: ${{ always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit + # with: + # exp-name: "gptoss_1k1k" calc-success-rate: - needs: [benchmark-dsr1, benchmark-gptoss, benchmark-gb200] + needs: [collect-dsr1-results] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh index bf41e1e12..9ac08c4b6 100755 --- a/runners/launch_gb200-nv-copy.sh +++ b/runners/launch_gb200-nv-copy.sh @@ -2,6 +2,7 @@ # This script sets up the environment and launches multi-node benchmarks +set -x # Set up environment variables for SLURM export SLURM_PARTITION="batch" @@ -28,6 +29,9 @@ fi export ISL="$ISL" export OSL="$OSL" +job_output= +job_id= + ### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs if [[ 
$FRAMEWORK == "dynamo-trtllm" ]]; then @@ -69,135 +73,71 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then exit 1 fi - # Generate benchmark configurations based on ISL/OSL and MTP mode - generate_benchmark_configs() { - local isl="$1" - local osl="$2" - local mtp_mode="$3" - - # Usage: - # ./submit_disagg.sh [ctx_num] [gen_num] [gen_tp_size] [gen_batch_size] [gen_max_num_tokens] [gen_gpu_memory_fraction] [gen_eplb_num_slots] [gen_mtp_size] [gen_concurrency_list]" - # MTP Modes: - # mtp=off - Run without Multi-Token Prediction (gen_mtp_size=0) - # mtp=on - Run with Multi-Token Prediction (gen_mtp_size=1,2,3) - # Execution Modes: - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # dep - Run Data-Expert Parallel mode (attention_dp=true) - # Parameters for tep/dep modes: - # ctx_num: Number of context nodes - # gen_num: Number of generation nodes - # gen_tp_size: Generation tensor parallel size - # gen_batch_size: Generation batch size - # gen_max_num_tokens: Generation max number of tokens - # gen_gpu_memory_fraction: GPU memory fraction (0.7-0.95) - # gen_mtp_size: Multi-Token Prediction size (0 for mtp=off, 1-3 for mtp=on) - # gen_eplb_num_slots: Expert load balancing slots (0, 256, 288) - # gen_concurrency_list: Concurrency values (space-separated, quoted) - - if [ "$isl" = "1024" ] && [ "$osl" = "1024" ]; then - if [ "$mtp_mode" = "on" ]; then - echo "Running 1k/1k MTP=ON configurations" - - ./submit_disagg.sh "mtp=on" "tep" 1 4 8 32 128 "0.9" 3 0 "1 2 4 8 16 36" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 16 64 256 "0.7" 3 0 "512 1075" - - ./submit_disagg.sh "mtp=on" "dep" 2 1 16 128 256 "0.7" 1 0 "2150" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 32 16 64 "0.6" 3 0 "512" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 8 256 512 "0.8" 1 0 "2252" - else - echo "Running 1k/1k MTP=OFF configurations" - - ./submit_disagg.sh "mtp=off" "tep" 1 4 8 128 128 "0.9" 0 0 "1 2 4 8 16 32 64 141" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 32 32 32 "0.7" 0 0 "1075" - - 
./submit_disagg.sh "mtp=off" "dep" 1 1 16 64 64 "0.75" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 2 1 16 256 256 "0.75" 0 0 "2048 4300" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 8 512 512 "0.8" 0 0 "4300" - fi - elif [ "$isl" = "8192" ] && [ "$osl" = "1024" ]; then - if [ "$mtp_mode" = "on" ]; then - echo "Running 8k/1k MTP=ON configurations" - - ./submit_disagg.sh "mtp=on" "tep" 1 3 8 16 64 "0.9" 3 0 "1 2 4 8 18" - - ./submit_disagg.sh "mtp=on" "dep" 5 1 32 8 32 "0.7" 3 0 "128 269" - - ./submit_disagg.sh "mtp=on" "dep" 8 1 32 16 64 "0.7" 3 0 "538" - - ./submit_disagg.sh "mtp=on" "dep" 8 1 16 64 256 "0.75" 2 0 "1075" - - ./submit_disagg.sh "mtp=on" "dep" 6 1 8 256 512 "0.8" 1 0 "2150" - else - echo "Running 8k/1k MTP=OFF configurations" - - ./submit_disagg.sh "mtp=off" "tep" 1 3 8 32 32 "0.9" 0 0 "1 2 4 8 16 34" - - ./submit_disagg.sh "mtp=off" "dep" 4 1 32 16 16 "0.7" 0 0 "256 538" - - ./submit_disagg.sh "mtp=off" "dep" 6 1 16 64 64 "0.75" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 8 1 16 128 128 "0.75" 0 0 "2150" - - ./submit_disagg.sh "mtp=off" "dep" 5 1 8 256 256 "0.8" 0 0 "2150" - fi - else - echo "Unsupported ISL/OSL combination: $isl/$osl" - exit 1 - fi - } - - # Run all benchmark configurations - generate_benchmark_configs "$ISL" "$OSL" "$MTP_MODE" - -else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" - # Set up Dynamo repository path - DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" - SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/components/backends/sglang/slurm_jobs" - - # Always clone and setup Dynamo - echo "Cloning Dynamo repository..." 
- rm -rf "$DYNAMO_PATH" - git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git $DYNAMO_PATH - cd "$DYNAMO_PATH" - - # Navigate to corresponding directory - cd "$SGL_SLURM_JOBS_PATH" - - # Set up SGL launch script-specific environment variables - export SLURM_ACCOUNT=$SLURM_ACCOUNT - export SLURM_PARTITION=$SLURM_PARTITION - export TIME_LIMIT="04:00:00" - export MODEL_PATH=$MODEL_PATH - export CONFIG_DIR=$CONFIG_DIR - export CONTAINER_IMAGE=$IMAGE - - # Launch jobs based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - concurrency_list="1024x2048x4096x4608x4864x4992x5120x5376x5632x6144x8192" - bash ./submit_disagg.sh 6 3 12 1 8 $ISL $OSL $concurrency_list inf - elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then - concurrency_list="128x256x384x448x512x576x1024x2048x4096" - bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL $concurrency_list inf - else - echo "Unsupported ISL/OSL combination: $ISL/$OSL" - exit 1 + # New stuff + # CONC + # ISL + # OSL + # IMAGE + + # PREFILL_NUM_WORKERS + # PREFILL_TP + # PREFILL_EP + # PREFILL_DP_ATTN + + # DECODE_NUM_WORKERS + # DECODE_TP + # DECODE_EP + # DECODE_DP_ATTN + + # Additional env vars needed + # PREFILL_MAX_NUM_TOKENS + # PREFILL_MAX_BATCH_SIZE + # DECODE_MAX_NUM_TOKENS + # DECODE_MAX_BATCH_SIZE + # DECODE_GPU_MEM_FRACTION + # DECODE_MTP_SIZE + # DECODE_EPLB_NUM_SLOTS + + # For GB200, we use 4 tasks per node. 
+ ntasks_per_node=4 + additional_slurm_args="--time=04:00:00" + + kind=dynamo_disagg + + gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) + total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) + total_tasks=$((total_nodes * ntasks_per_node)) + + set +x + # 4608 prefill max num toks originally + if [ $ISL == $OSL ]; then + sbatch --nodes=${total_nodes} \ + --ntasks=${total_tasks} \ + --ntasks-per-node=${ntasks_per_node} \ + --segment=${total_nodes} ${additional_slurm_args} \ + benchmark_disagg.slurm \ + ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ + ${PREFILL_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ + ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ + ${DECODE_TP} ${DECODE_BATCH_SIZE} \ + ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ + ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ + ${DECODE_MTP_SIZE} ${CONC} \ + ${gen_nodes} ${kind} \ + ${MODEL_PATH} ${SERVED_MODEL_NAME} \ + ${IMAGE} ${ISL} ${OSL} + # else + # sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} fi fi # Wait for all jobs to complete -echo "Waiting for all jobs to complete..." +# echo "Waiting for all jobs to complete..." while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do echo "Jobs still running..." 
- squeue -u $USER + squeue --steps -u $USER sleep 60 done -echo "All jobs completed" ### FRAMEWORK_DIFF_IF_STATEMENT #3 - difference in log post-processing if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then @@ -283,4 +223,4 @@ else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement done fi -echo "All result files processed" +echo "All result files processed" \ No newline at end of file From c967d83cb40a4fa9f47f5381b063de0df6187856 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:11:38 -0600 Subject: [PATCH 04/98] revert and correct name of 1k1k scheduler workflow --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 9f75335b0..b782b3616 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -1,4 +1,4 @@ -name: "Debug Test" +name: "Full Sweep Scheduler - 1k1k" on: workflow_dispatch: From 1b1b7a4fb3005dc518eb96aab7590deb6b74020f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:13:24 -0600 Subject: [PATCH 05/98] adding runners.yaml to workflow invocation --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index b782b3616..0018ee5ad 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -15,7 +15,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --seq-lens 1k1k --model-prefix dsr1) + CONFIG_JSON_MULTI_NODE=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo $CONFIG_JSON_MULTI_NODE From 1c03192566399fac2c29babeaaa5b89c5582c2fb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:16:17 -0600 Subject: [PATCH 06/98] toJson on conc since it is now a list --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 0018ee5ad..945a6675f 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -52,7 +52,7 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_1k1k" - conc: ${{ matrix.config.conc }} + conc: ${{ toJson(matrix.config.conc) }} mtp-mode: ${{ matrix.config.mtp }} prefill-num-worker: ${{ matrix.config.prefill.num-worker }} From 9fa8e9252cf2f123659b63ed7ad8a2790e6fc2c5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:21:15 -0600 Subject: [PATCH 07/98] correctly sending conc list to multnode --- .../workflows/benchmark-multinode-tmpl.yml | 4 ++-- .../workflows/full-sweep-1k1k-scheduler.yml | 2 +- runners/launch_gb200-nv-copy.sh | 23 +++++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 9742bd466..8e3b52b4c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -27,7 +27,7 @@ on: osl: required: true type: string - conc: + conc-list: required: true type: string mtp-mode: @@ -85,7 +85,7 
@@ env: OSL: ${{ inputs.osl }} MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} - CONC: ${{ inputs.conc }} + CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }} MTP_MODE: ${{ inputs.mtp-mode }} PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 945a6675f..cc893cfae 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -52,7 +52,7 @@ jobs: framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_1k1k" - conc: ${{ toJson(matrix.config.conc) }} + conc-list: ${{ toJson(matrix.config.conc) }} mtp-mode: ${{ matrix.config.mtp }} prefill-num-worker: ${{ matrix.config.prefill.num-worker }} diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh index 9ac08c4b6..4c0e2ea04 100755 --- a/runners/launch_gb200-nv-copy.sh +++ b/runners/launch_gb200-nv-copy.sh @@ -98,6 +98,29 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then # DECODE_MTP_SIZE # DECODE_EPLB_NUM_SLOTS + echo "CONC=$CONC" + echo "ISL=$ISL" + echo "OSL=$OSL" + echo "IMAGE=$IMAGE" + + echo "PREFILL_NUM_WORKERS=$PREFILL_NUM_WORKERS" + echo "PREFILL_TP=$PREFILL_TP" + echo "PREFILL_EP=$PREFILL_EP" + echo "PREFILL_DP_ATTN=$PREFILL_DP_ATTN" + + echo "DECODE_NUM_WORKERS=$DECODE_NUM_WORKERS" + echo "DECODE_TP=$DECODE_TP" + echo "DECODE_EP=$DECODE_EP" + echo "DECODE_DP_ATTN=$DECODE_DP_ATTN" + + echo "PREFILL_MAX_NUM_TOKENS=$PREFILL_MAX_NUM_TOKENS" + echo "PREFILL_BATCH_SIZE=$PREFILL_BATCH_SIZE" + echo "DECODE_MAX_NUM_TOKENS=$DECODE_MAX_NUM_TOKENS" + echo "DECODE_BATCH_SIZE=$DECODE_BATCH_SIZE" + echo "DECODE_GPU_MEM_FRACTION=$DECODE_GPU_MEM_FRACTION" + echo "DECODE_MTP_SIZE=$DECODE_MTP_SIZE" + echo "DECODE_EPLB_NUM_SLOTS=$DECODE_EPLB_NUM_SLOTS" + # For GB200, we use 4 tasks per node. 
ntasks_per_node=4 additional_slurm_args="--time=04:00:00" From fc77648472775aca5aa1eb75c68bb1c20897ff20 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:26:07 -0600 Subject: [PATCH 08/98] hotfix --- .github/workflows/benchmark-multinode-tmpl.yml | 2 +- runners/launch_gb200-nv-copy.sh | 3 --- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 8e3b52b4c..d8050410c 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -129,7 +129,7 @@ jobs: run: | # bash ./runners/launch_${RUNNER_NAME%%_*}.sh set -x - export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} + export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} bash ./runners/launch_gb200-nv-copy.sh # Check if at least one result file was created if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh index 4c0e2ea04..6418f785e 100755 --- a/runners/launch_gb200-nv-copy.sh +++ b/runners/launch_gb200-nv-copy.sh @@ -29,9 +29,6 @@ fi export ISL="$ISL" export OSL="$OSL" -job_output= -job_id= - ### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then From 06ecd9c0573c9fbfd620e29d6ccb3fd26c09ccb4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:28:50 -0600 Subject: [PATCH 09/98] correct env var to MAX batch size --- runners/launch_gb200-nv-copy.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh index 6418f785e..4e7f63a7d 100755 --- a/runners/launch_gb200-nv-copy.sh +++ b/runners/launch_gb200-nv-copy.sh @@ -111,9 +111,9 @@ if [[ $FRAMEWORK == 
"dynamo-trtllm" ]]; then echo "DECODE_DP_ATTN=$DECODE_DP_ATTN" echo "PREFILL_MAX_NUM_TOKENS=$PREFILL_MAX_NUM_TOKENS" - echo "PREFILL_BATCH_SIZE=$PREFILL_BATCH_SIZE" + echo "PREFILL_MAX_BATCH_SIZE=$PREFILL_MAX_BATCH_SIZE" echo "DECODE_MAX_NUM_TOKENS=$DECODE_MAX_NUM_TOKENS" - echo "DECODE_BATCH_SIZE=$DECODE_BATCH_SIZE" + echo "DECODE_MAX_BATCH_SIZE=$DECODE_MAX_BATCH_SIZE" echo "DECODE_GPU_MEM_FRACTION=$DECODE_GPU_MEM_FRACTION" echo "DECODE_MTP_SIZE=$DECODE_MTP_SIZE" echo "DECODE_EPLB_NUM_SLOTS=$DECODE_EPLB_NUM_SLOTS" @@ -137,9 +137,9 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then --segment=${total_nodes} ${additional_slurm_args} \ benchmark_disagg.slurm \ ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ - ${PREFILL_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ + ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ - ${DECODE_TP} ${DECODE_BATCH_SIZE} \ + ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ ${DECODE_MTP_SIZE} ${CONC} \ From 2401e656c01d9216f02f4831d577b0a1ce9a8d53 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:31:11 -0600 Subject: [PATCH 10/98] set -x --- runners/launch_gb200-nv-copy.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh index 4e7f63a7d..1bca7a359 100755 --- a/runners/launch_gb200-nv-copy.sh +++ b/runners/launch_gb200-nv-copy.sh @@ -128,7 +128,6 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) total_tasks=$((total_nodes * ntasks_per_node)) - set +x # 4608 prefill max num toks originally if [ $ISL == $OSL ]; then sbatch --nodes=${total_nodes} \ From 4104b154fb196c301150e9ab64982d4f92a208f9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:38:43 -0600 Subject: [PATCH 11/98] debugging with dynmao fork --- runners/launch_gb200-nv-copy.sh | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh index 1bca7a359..33bb6ef10 100755 --- a/runners/launch_gb200-nv-copy.sh +++ b/runners/launch_gb200-nv-copy.sh @@ -48,9 +48,9 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then # Always clone and setup Dynamo echo "Cloning Dynamo repository..." rm -rf "$DYNAMO_PATH" - git clone https://github.com/ai-dynamo/dynamo.git "$DYNAMO_PATH" + git clone https://github.com/cquil11/dynamo.git "$DYNAMO_PATH" cd "$DYNAMO_PATH" - git checkout release/0.5.1-rc0.20251105 + git checkout release/0.5.1-rc0.20251105-cam git submodule update --init --recursive # Navigate to performance sweeps directory From d2a5c9f1d3e6ea7a44b83fa018f50e6870166039 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:42:25 -0600 Subject: [PATCH 12/98] debugging with dynmao fork pt 2 --- runners/launch_gb200-nv-copy.sh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh index 33bb6ef10..8c11cb0bf 100755 --- a/runners/launch_gb200-nv-copy.sh +++ b/runners/launch_gb200-nv-copy.sh @@ -71,7 +71,7 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then fi # New stuff - # CONC + # CONC_LIST # ISL # OSL # IMAGE @@ -95,7 +95,7 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then # DECODE_MTP_SIZE # DECODE_EPLB_NUM_SLOTS - echo "CONC=$CONC" + echo "CONC_LIST=$CONC_LIST" echo "ISL=$ISL" echo "OSL=$OSL" echo "IMAGE=$IMAGE" @@ -141,7 +141,7 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ - ${DECODE_MTP_SIZE} ${CONC} \ + ${DECODE_MTP_SIZE} ${CONC_LIST} \ ${gen_nodes} ${kind} \ ${MODEL_PATH} ${SERVED_MODEL_NAME} \ ${IMAGE} ${ISL} ${OSL} From 9ca96b17fa780dfa6a3cf74bc7723c68884c0358 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 21 Nov 2025 16:47:04 -0600 Subject: [PATCH 13/98] 
experiment --- .github/workflows/benchmark-multinode-tmpl.yml | 2 +- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index d8050410c..6e624d625 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -30,7 +30,7 @@ on: conc-list: required: true type: string - mtp-mode: + spec-decoding: required: true type: string diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index cc893cfae..64624fedb 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -53,7 +53,7 @@ jobs: precision: ${{ matrix.config.precision }} exp-name: "dsr1_1k1k" conc-list: ${{ toJson(matrix.config.conc) }} - mtp-mode: ${{ matrix.config.mtp }} + spec-decoding: ${{ matrix.config.spec-decoding }} prefill-num-worker: ${{ matrix.config.prefill.num-worker }} prefill-tp: ${{ matrix.config.prefill.tp }} From 379ccd4821b84efeac197aec72fd118e4ddb4185 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 12:54:09 -0600 Subject: [PATCH 14/98] adding separate script for launching --- .github/configs/nvidia-master.yaml | 2 +- .../workflows/benchmark-multinode-tmpl.yml | 4 +- benchmarks/benchmark_lib.sh | 21 ++ benchmarks/dsr1_fp4_gb200_dynamo_trt_slurm.sh | 124 +++++++++ .../dsr1_fp8_gb200_dynamo_sglang_slurm.sh | 17 ++ runners/launch_gb200-nv-copy-2.sh | 247 ++++++++++++++++++ 6 files changed, 412 insertions(+), 3 deletions(-) create mode 100644 benchmarks/dsr1_fp4_gb200_dynamo_trt_slurm.sh create mode 100644 benchmarks/dsr1_fp8_gb200_dynamo_sglang_slurm.sh create mode 100755 runners/launch_gb200-nv-copy-2.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index abcbc7544..69d62e24f 100644 --- 
a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -332,7 +332,7 @@ dsr1-fp4-gb200-trt: model-prefix: dsr1 runner: gb200 precision: fp4 - framework: dynamo-trtllm + framework: dynamo-trt multinode: true seq-len-configs: - isl: 1024 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 6e624d625..b5306c33a 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -86,7 +86,7 @@ env: MAX_MODEL_LEN: ${{ inputs.max-model-len }} RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }} - MTP_MODE: ${{ inputs.mtp-mode }} + SPEC_DECODING: ${{ inputs.spec-decoding }} PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} @@ -130,7 +130,7 @@ jobs: # bash ./runners/launch_${RUNNER_NAME%%_*}.sh set -x export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} - bash ./runners/launch_gb200-nv-copy.sh + bash ./runners/launch_gb200-nv-copy-2.sh # Check if at least one result file was created if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV diff --git a/benchmarks/benchmark_lib.sh b/benchmarks/benchmark_lib.sh index 0458cfb78..a4540dfeb 100644 --- a/benchmarks/benchmark_lib.sh +++ b/benchmarks/benchmark_lib.sh @@ -2,6 +2,27 @@ # Shared benchmarking utilities for InferenceMAX +# Check if required environment variables are set +# Usage: check_env_vars VAR1 VAR2 VAR3 ... 
+# Exits with code 1 if any variable is not set +check_env_vars() { + local missing_vars=() + + for var_name in "$@"; do + if [[ -z "${!var_name}" ]]; then + missing_vars+=("$var_name") + fi + done + + if [[ ${#missing_vars[@]} -gt 0 ]]; then + echo "Error: The following required environment variables are not set:" + for var in "${missing_vars[@]}"; do + echo " - $var" + done + exit 1 + fi +} + # Wait for server to be ready by polling the health endpoint # All parameters are required # Parameters: diff --git a/benchmarks/dsr1_fp4_gb200_dynamo_trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo_trt_slurm.sh new file mode 100644 index 000000000..f4d8b2dcc --- /dev/null +++ b/benchmarks/dsr1_fp4_gb200_dynamo_trt_slurm.sh @@ -0,0 +1,124 @@ +#!/usr/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_MAX_NUM_TOKENS PREFILL_MAX_BATCH_SIZE DECODE_MAX_NUM_TOKENS \ + DECODE_MAX_BATCH_SIZE DECODE_GPU_MEM_FRACTION DECODE_EPLB_NUM_SLOTS \ + NTASKS_PER_NODE + +if [ "$SPEC_DECODING" == "mtp" ]; then + check_env_vars DECODE_MTP_SIZE +else + DECODE_MTP_SIZE="0" + +PERFORMANCE_SWEEPS_PATH="components/backends/trtllm/performance_sweeps" + +echo "Cloning Dynamo repository..." 
+git clone https://github.com/cquil11/dynamo.git +cd dynamo +git checkout release/0.5.1-rc0.20251105-cam +git submodule update --init --recursive + +cd "$PERFORMANCE_SWEEPS_PATH" + +# Set up environment variables based on ISL/OSL +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 +elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then + export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 +else + echo "Unsupported ISL/OSL combination: $ISL/$OSL" + exit 1 +fi + +kind=dynamo_disagg +additional_slurm_args="--time=04:00:00" + +gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) +total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) +total_tasks=$((total_nodes * NTASKS_PER_NODE)) + +sbatch --nodes=${total_nodes} \ + --ntasks=${total_tasks} \ + --ntasks-per-node=${ntasks_per_node} \ + --segment=${total_nodes} ${additional_slurm_args} \ + benchmark_disagg.slurm \ + ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ + ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ + ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ + ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ + ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ + ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ + ${DECODE_MTP_SIZE} ${CONC_LIST} \ + ${gen_nodes} ${kind} \ + ${MODEL_PATH} ${SERVED_MODEL_NAME} \ + ${IMAGE} ${ISL} ${OSL} + +# Wait for all jobs to complete +echo "Waiting for all jobs to complete..." +while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do + echo "Jobs still running..." + squeue --steps -u $USER + sleep 30 +done + +# Find the logs directory (should be only one for this ISL/OSL combination) +LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) +if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 +fi + +echo "Found logs directory: $LOGS_DIR" + +# Find all result subdirectories in this logs directory +RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) + +if [ -z "$RESULT_SUBDIRS" ]; then + echo "No result subdirectories found in $LOGS_DIR" + exit 1 +fi + +echo "Found result subdirectories:" +echo "$RESULT_SUBDIRS" + +# Process results from all configurations +for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Process individual concurrency result files + RESULTS_SUBDIR="$result_subdir/results" + + if [ -d "$RESULTS_SUBDIR" ]; then + echo "Processing results from: $RESULTS_SUBDIR" + + # Find all concurrency result files with new format + CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") + + for result_file in $CONCURRENCY_FILES; do + if [ -f "$result_file" ]; then + # Extract concurrency and GPU count from filename + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') + gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') + echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" + + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + else + echo "Results subdirectory not found: $RESULTS_SUBDIR" + fi +done \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo_sglang_slurm.sh 
b/benchmarks/dsr1_fp8_gb200_dynamo_sglang_slurm.sh new file mode 100644 index 000000000..a93128d41 --- /dev/null +++ b/benchmarks/dsr1_fp8_gb200_dynamo_sglang_slurm.sh @@ -0,0 +1,17 @@ +#!/usr/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars + +SGL_SLURM_JOBS_PATH="components/backends/sglang/slurm_jobs" + +# Always clone and setup Dynamo +echo "Cloning Dynamo repository..." +rm -rf "$DYNAMO_PATH" +git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git $DYNAMO_PATH +cd dynamo +cd "$SGL_SLURM_JOBS_PATH" + diff --git a/runners/launch_gb200-nv-copy-2.sh b/runners/launch_gb200-nv-copy-2.sh new file mode 100755 index 000000000..c2a48e634 --- /dev/null +++ b/runners/launch_gb200-nv-copy-2.sh @@ -0,0 +1,247 @@ +#!/usr/bin/bash + +# This script sets up the environment and launches multi-node benchmarks + +set -x + +# Set up environment variables for SLURM +export SLURM_PARTITION="batch" +export SLURM_ACCOUNT="benchmark" +export SLURM_JOB_NAME="benchmark-dynamo.job" + +### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars +if [[ $FRAMEWORK == "dynamo-sglang" ]]; then + export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" + export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" +else + SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + + # Update the IMAGE variable to the squash file + export IMAGE=$SQUASH_FILE + + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" + export SERVED_MODEL_NAME="deepseek-r1-fp4" +fi + + +export ISL="$ISL" +export OSL="$OSL" + +bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}_slurm.sh" + +# ### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs +# if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then 
+ +# # Set up Dynamo repository path +# DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" +# PERFORMANCE_SWEEPS_PATH="$DYNAMO_PATH/components/backends/trtllm/performance_sweeps" + +# # Overview: +# # The Dynamo repository contains the bench_serving repository as a submodule. +# # The submit_disagg.sh script, located at $PERFORMANCE_SWEEPS_PATH, orchestrates the entire benchmarking workflow: +# # 1. Launches the Dynamo inference service with the specified configuration. +# # 2. Waits for the service to become healthy. +# # 3. Initiates benchmarking using the bench_serving tools. +# # 4. Monitors all jobs until completion. +# # 5. Collects and processes the results. + +# # Always clone and setup Dynamo +# echo "Cloning Dynamo repository..." +# rm -rf "$DYNAMO_PATH" +# git clone https://github.com/cquil11/dynamo.git "$DYNAMO_PATH" +# cd "$DYNAMO_PATH" +# git checkout release/0.5.1-rc0.20251105-cam +# git submodule update --init --recursive + +# # Navigate to performance sweeps directory +# cd "$PERFORMANCE_SWEEPS_PATH" + +# # 1. CACHE_TRANSCEIVER_MAX_NUM_TOKENS controls the max_tokens_in_buffer value +# # in cache_transceiver_config of TensorRT-LLM context and generation workers. +# # Specifically, it is the max number of tokens the transfer buffer can fit. 
+ +# # Set up environment variables based on ISL/OSL +# if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then +# export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 +# elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then +# export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 +# else +# echo "Unsupported ISL/OSL combination: $ISL/$OSL" +# exit 1 +# fi + +# # New stuff +# # CONC_LIST +# # ISL +# # OSL +# # IMAGE + +# # PREFILL_NUM_WORKERS +# # PREFILL_TP +# # PREFILL_EP +# # PREFILL_DP_ATTN + +# # DECODE_NUM_WORKERS +# # DECODE_TP +# # DECODE_EP +# # DECODE_DP_ATTN + +# # Additional env vars needed +# # PREFILL_MAX_NUM_TOKENS +# # PREFILL_MAX_BATCH_SIZE +# # DECODE_MAX_NUM_TOKENS +# # DECODE_MAX_BATCH_SIZE +# # DECODE_GPU_MEM_FRACTION +# # DECODE_MTP_SIZE +# # DECODE_EPLB_NUM_SLOTS + +# echo "CONC_LIST=$CONC_LIST" +# echo "ISL=$ISL" +# echo "OSL=$OSL" +# echo "IMAGE=$IMAGE" + +# echo "PREFILL_NUM_WORKERS=$PREFILL_NUM_WORKERS" +# echo "PREFILL_TP=$PREFILL_TP" +# echo "PREFILL_EP=$PREFILL_EP" +# echo "PREFILL_DP_ATTN=$PREFILL_DP_ATTN" + +# echo "DECODE_NUM_WORKERS=$DECODE_NUM_WORKERS" +# echo "DECODE_TP=$DECODE_TP" +# echo "DECODE_EP=$DECODE_EP" +# echo "DECODE_DP_ATTN=$DECODE_DP_ATTN" + +# echo "PREFILL_MAX_NUM_TOKENS=$PREFILL_MAX_NUM_TOKENS" +# echo "PREFILL_MAX_BATCH_SIZE=$PREFILL_MAX_BATCH_SIZE" +# echo "DECODE_MAX_NUM_TOKENS=$DECODE_MAX_NUM_TOKENS" +# echo "DECODE_MAX_BATCH_SIZE=$DECODE_MAX_BATCH_SIZE" +# echo "DECODE_GPU_MEM_FRACTION=$DECODE_GPU_MEM_FRACTION" +# echo "DECODE_MTP_SIZE=$DECODE_MTP_SIZE" +# echo "DECODE_EPLB_NUM_SLOTS=$DECODE_EPLB_NUM_SLOTS" + +# # For GB200, we use 4 tasks per node. 
+# ntasks_per_node=4 +# additional_slurm_args="--time=04:00:00" + +# kind=dynamo_disagg + +# gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) +# total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) +# total_tasks=$((total_nodes * ntasks_per_node)) + +# # 4608 prefill max num toks originally +# if [ $ISL == $OSL ]; then +# sbatch --nodes=${total_nodes} \ +# --ntasks=${total_tasks} \ +# --ntasks-per-node=${ntasks_per_node} \ +# --segment=${total_nodes} ${additional_slurm_args} \ +# benchmark_disagg.slurm \ +# ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ +# ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ +# ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ +# ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ +# ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ +# ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ +# ${DECODE_MTP_SIZE} ${CONC_LIST} \ +# ${gen_nodes} ${kind} \ +# ${MODEL_PATH} ${SERVED_MODEL_NAME} \ +# ${IMAGE} ${ISL} ${OSL} +# # else +# # sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} +# fi +# fi + +# # Wait for all jobs to complete +# # echo "Waiting for all jobs to complete..." +# while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do +# echo "Jobs still running..." +# squeue --steps -u $USER +# sleep 60 +# done + +# ### FRAMEWORK_DIFF_IF_STATEMENT #3 - difference in log post-processing +# if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then + +# # Find the logs directory (should be only one for this ISL/OSL combination) +# LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) +# if [ -z "$LOGS_DIR" ]; then +# echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" +# exit 1 +# fi + +# echo "Found logs directory: $LOGS_DIR" + +# # Find all result subdirectories in this logs directory +# RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) + +# if [ -z "$RESULT_SUBDIRS" ]; then +# echo "No result subdirectories found in $LOGS_DIR" +# exit 1 +# fi + +# echo "Found result subdirectories:" +# echo "$RESULT_SUBDIRS" + +# # Process results from all configurations +# for result_subdir in $RESULT_SUBDIRS; do +# echo "Processing result subdirectory: $result_subdir" + +# # Extract configuration info from directory name +# CONFIG_NAME=$(basename "$result_subdir") + +# # Process individual concurrency result files +# RESULTS_SUBDIR="$result_subdir/results" + +# if [ -d "$RESULTS_SUBDIR" ]; then +# echo "Processing results from: $RESULTS_SUBDIR" + +# # Find all concurrency result files with new format +# CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") + +# for result_file in $CONCURRENCY_FILES; do +# if [ -f "$result_file" ]; then +# # Extract concurrency and GPU count from filename +# filename=$(basename "$result_file") +# concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') +# gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') +# echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" + +# # Copy the result file to workspace with a unique name +# WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" +# cp "$result_file" "$WORKSPACE_RESULT_FILE" + +# echo "Copied result file to: $WORKSPACE_RESULT_FILE" +# fi +# done +# else +# echo "Results subdirectory not found: $RESULTS_SUBDIR" +# fi +# done + +# else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement +# # 
Find the latest log directory +# # we do "tail -1" here since only the latest job will yield the result +# LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) +# if [ -z "$LOGS_DIR" ]; then +# echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" +# exit 1 +# fi + +# echo "Found logs directory: $LOGS_DIR" +# ls $LOGS_DIR + +# # Result JSON are contained within the result directory +# for result_file in $(find $LOGS_DIR -type f); do +# # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json +# file_name=$(basename $result_file) +# if [ -f $result_file ]; then +# # Copy the result file to workspace with a unique name +# WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" +# echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}" +# cp $result_file $WORKSPACE_RESULT_FILE +# fi +# done +# fi + +# echo "All result files processed" \ No newline at end of file From cc24ba5474c8c01dcec6604af6ca26aea107019e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 13:04:56 -0600 Subject: [PATCH 15/98] changing filenames --- ...200_dynamo_trt_slurm.sh => dsr1_fp4_gb200_dynamo-trt_slurm.sh} | 0 ...namo_sglang_slurm.sh => dsr1_fp8_gb200_dynamo-sglang_slurm.sh} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename benchmarks/{dsr1_fp4_gb200_dynamo_trt_slurm.sh => dsr1_fp4_gb200_dynamo-trt_slurm.sh} (100%) rename benchmarks/{dsr1_fp8_gb200_dynamo_sglang_slurm.sh => dsr1_fp8_gb200_dynamo-sglang_slurm.sh} (100%) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo_trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh similarity index 100% rename from benchmarks/dsr1_fp4_gb200_dynamo_trt_slurm.sh rename to benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh diff --git a/benchmarks/dsr1_fp8_gb200_dynamo_sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh similarity index 100% rename from benchmarks/dsr1_fp8_gb200_dynamo_sglang_slurm.sh 
rename to benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh From cfd989023af4d7884d4befee2131f546322f614f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 13:07:15 -0600 Subject: [PATCH 16/98] ntasks per node --- runners/launch_gb200-nv-copy-2.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/runners/launch_gb200-nv-copy-2.sh b/runners/launch_gb200-nv-copy-2.sh index c2a48e634..97d3b1fcd 100755 --- a/runners/launch_gb200-nv-copy-2.sh +++ b/runners/launch_gb200-nv-copy-2.sh @@ -8,6 +8,8 @@ set -x export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" export SLURM_JOB_NAME="benchmark-dynamo.job" +# For GB200 we have 4 GPUs per node +export NTASKS_PER_NODE=4 ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars if [[ $FRAMEWORK == "dynamo-sglang" ]]; then From 6cf098c7e89ca29a24f5ed9f0db71038169b44a2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 13:24:41 -0600 Subject: [PATCH 17/98] making the spec-decoding output required --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 1 + utils/matrix-logic/generate_sweep_configs.py | 9 ++++----- utils/matrix-logic/validation.py | 14 +++++++------- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index f4d8b2dcc..1f38f3997 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -15,6 +15,7 @@ if [ "$SPEC_DECODING" == "mtp" ]; then check_env_vars DECODE_MTP_SIZE else DECODE_MTP_SIZE="0" +fi PERFORMANCE_SWEEPS_PATH="components/backends/trtllm/performance_sweeps" diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 37c567279..c30e64ff4 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -134,7 +134,9 @@ def generate_full_sweep(args, all_config_data, runner_data): continue # Multinode 
configuration - spec_decoding = bmk.get(Fields.SPEC_DECODING.value) + # spec_decoding defaults to "none" if not specified + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + prefill = bmk[Fields.PREFILL.value] decode = bmk[Fields.DECODE.value] @@ -167,6 +169,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.RUNNER.value: runner, Fields.ISL.value: isl, Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, Fields.PREFILL.value: prefill, Fields.DECODE.value: decode, Fields.CONC.value: conc_values, # Pass the entire list for multinode @@ -174,10 +177,6 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", } - # Add spec_decoding if specified - if spec_decoding is not None: - entry[Fields.SPEC_DECODING.value] = spec_decoding - validate_matrix_output(entry, is_multinode) matrix_values.append(entry) elif args.single_node: diff --git a/utils/matrix-logic/validation.py b/utils/matrix-logic/validation.py index 9fca9bfa9..887baafc1 100644 --- a/utils/matrix-logic/validation.py +++ b/utils/matrix-logic/validation.py @@ -46,7 +46,7 @@ class Fields(Enum): class SingleNodeMatrixEntry(BaseModel): - """Pydantic model for validating single node matrix entry structure. + """Pydantic model for validating single node matrix entry structure. 
This validates the input that should be expected to .github/workflows/benchmark-tmpl.yml""" model_config = ConfigDict(extra='forbid', populate_by_name=True) @@ -54,7 +54,7 @@ class SingleNodeMatrixEntry(BaseModel): model: str precision: str framework: str - spec_decoding: Optional[Literal["mtp", "draft_model"]] = Field( + spec_decoding: Optional[Literal["mtp", "draft_model", "none"]] = Field( default=None, alias=Fields.SPEC_DECODING.value ) @@ -90,8 +90,7 @@ class MultiNodeMatrixEntry(BaseModel): model: str precision: str framework: str - spec_decoding: Optional[Literal["mtp", "draft_model"]] = Field( - default=None, + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( alias=Fields.SPEC_DECODING.value ) runner: str @@ -169,7 +168,7 @@ class SingleNodeSearchSpaceEntry(BaseModel): tp: int ep: Optional[int] = None - spec_decoding: Optional[Literal["mtp", "draft_model"] + spec_decoding: Optional[Literal["mtp", "draft_model", "none"] ] = Field(default=None, alias=Fields.SPEC_DECODING.value) dp_attn: Optional[bool] = Field( default=None, alias=Fields.DP_ATTN.value) @@ -189,8 +188,9 @@ class MultiNodeSearchSpaceEntry(BaseModel): """Multinode search space configuration.""" model_config = ConfigDict(extra='forbid', populate_by_name=True) - spec_decoding: Optional[Literal["mtp", "draft_model"] - ] = Field(default=None, alias=Fields.SPEC_DECODING.value) + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + default="none", + alias=Fields.SPEC_DECODING.value) prefill: WorkerConfig decode: WorkerConfig conc_start: Optional[int] = Field( From 34d8dc3fd46728f5cf5e95b9a6d860df9147a630 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 13:26:34 -0600 Subject: [PATCH 18/98] updating ntasks per node gp --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 1f38f3997..621c6d05b 100644 --- 
a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -46,7 +46,7 @@ total_tasks=$((total_nodes * NTASKS_PER_NODE)) sbatch --nodes=${total_nodes} \ --ntasks=${total_tasks} \ - --ntasks-per-node=${ntasks_per_node} \ + --ntasks-per-node=${NTASKS_PER_NODE} \ --segment=${total_nodes} ${additional_slurm_args} \ benchmark_disagg.slurm \ ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ From bccc2f1c3de1d835d6cfe25a2f023bd0fafcbc22 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 13:42:20 -0600 Subject: [PATCH 19/98] test --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 35 +++++++++++++++---- 1 file changed, 29 insertions(+), 6 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 621c6d05b..7399eac23 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -60,13 +60,36 @@ sbatch --nodes=${total_nodes} \ ${MODEL_PATH} ${SERVED_MODEL_NAME} \ ${IMAGE} ${ISL} ${OSL} -# Wait for all jobs to complete +# # Wait for all jobs to complete +# echo "Waiting for all jobs to complete..." +# while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do +# echo "Jobs still running..." +# squeue --steps -u $USER +# sleep 30 +# done + echo "Waiting for all jobs to complete..." -while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - echo "Jobs still running..." - squeue --steps -u $USER - sleep 30 -done +JOB_ID=$(squeue -u $USER --noheader --format='%i' | head -1) + +if [ -n "$JOB_ID" ]; then + # Tail the slurm output file + SLURM_LOG="slurm-${JOB_ID}.out" + echo "Tailing ${SLURM_LOG}..." + + # Wait for log file to appear, then tail it + while [ ! -f "$SLURM_LOG" ]; do + sleep 2 + done + tail -f "$SLURM_LOG" & + TAIL_PID=$! 
+ + # Wait for job to finish + while squeue -j $JOB_ID --noheader 2>/dev/null | grep -q .; do + sleep 30 + done + + kill $TAIL_PID 2>/dev/null +fi # Find the logs directory (should be only one for this ISL/OSL combination) LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) From 09b9ae5e9647c0ac0997cdee96f3ee7139ab717a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 13:44:01 -0600 Subject: [PATCH 20/98] test --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 7399eac23..12adb2055 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -68,12 +68,13 @@ sbatch --nodes=${total_nodes} \ # sleep 30 # done +# After sbatch submission echo "Waiting for all jobs to complete..." JOB_ID=$(squeue -u $USER --noheader --format='%i' | head -1) if [ -n "$JOB_ID" ]; then - # Tail the slurm output file - SLURM_LOG="slurm-${JOB_ID}.out" + # The slurm log is in the directory where sbatch was executed + SLURM_LOG="dynamo/components/backends/trtllm/performance_sweeps/slurm-${JOB_ID}.out" echo "Tailing ${SLURM_LOG}..." 
# Wait for log file to appear, then tail it From 56ccdcdac637f4b853df4a5506938b92ab2cd499 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 13:46:51 -0600 Subject: [PATCH 21/98] conc list quoted --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 12adb2055..e7bbaac77 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -55,7 +55,7 @@ sbatch --nodes=${total_nodes} \ ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ - ${DECODE_MTP_SIZE} ${CONC_LIST} \ + ${DECODE_MTP_SIZE} "${CONC_LIST}" \ ${gen_nodes} ${kind} \ ${MODEL_PATH} ${SERVED_MODEL_NAME} \ ${IMAGE} ${ISL} ${OSL} From a81e309fabcd794a09e41492bad9e33c7c82e96b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 14:14:47 -0600 Subject: [PATCH 22/98] get rid of debug code --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 36 ++++--------------- 1 file changed, 6 insertions(+), 30 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index e7bbaac77..ec40af905 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -60,37 +60,13 @@ sbatch --nodes=${total_nodes} \ ${MODEL_PATH} ${SERVED_MODEL_NAME} \ ${IMAGE} ${ISL} ${OSL} -# # Wait for all jobs to complete -# echo "Waiting for all jobs to complete..." -# while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do -# echo "Jobs still running..." -# squeue --steps -u $USER -# sleep 30 -# done - -# After sbatch submission +# Wait for all jobs to complete echo "Waiting for all jobs to complete..." 
-JOB_ID=$(squeue -u $USER --noheader --format='%i' | head -1) - -if [ -n "$JOB_ID" ]; then - # The slurm log is in the directory where sbatch was executed - SLURM_LOG="dynamo/components/backends/trtllm/performance_sweeps/slurm-${JOB_ID}.out" - echo "Tailing ${SLURM_LOG}..." - - # Wait for log file to appear, then tail it - while [ ! -f "$SLURM_LOG" ]; do - sleep 2 - done - tail -f "$SLURM_LOG" & - TAIL_PID=$! - - # Wait for job to finish - while squeue -j $JOB_ID --noheader 2>/dev/null | grep -q .; do - sleep 30 - done - - kill $TAIL_PID 2>/dev/null -fi +while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do + echo "Jobs still running..." + squeue --steps -u $USER + sleep 30 +done # Find the logs directory (should be only one for this ISL/OSL combination) LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) From 6d818e54c322ede46455695778e5faf36783aa70 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 16:04:00 -0600 Subject: [PATCH 23/98] testing support for dsr1 --- .github/configs/nvidia-master.yaml | 56 ++++++++++++++++++- .../workflows/benchmark-multinode-tmpl.yml | 2 +- .../dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 49 ++++++++++++++++ 3 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 69d62e24f..5735a5d41 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -326,7 +326,7 @@ gptoss-fp4-h200-vllm: - { tp: 4, conc-start: 4, conc-end: 64 } - { tp: 8, conc-start: 4, conc-end: 32 } -dsr1-fp4-gb200-trt: +dsr1-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 model: deepseek-r1-fp4 model-prefix: dsr1 @@ -786,3 +786,57 @@ dsr1-fp4-gb200-trt: - "DECODE_GPU_MEM_FRACTION=0.8" - "DECODE_MTP_SIZE=0" - "DECODE_EPLB_NUM_SLOTS=0" + +dsr1-fp8-gb200-dynamo-sglang: + image: 
nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: gb200 + precision: fp8 + framework: dynamo-sglang + multinode: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] + prefill: + num-worker: 3 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=6" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + prefill: + num-worker: 6 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=12" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=6" diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index b5306c33a..e9f4acbb5 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -125,7 +125,7 @@ jobs: - name: Launch multi-node job script env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_mtp-${{ env.MTP_MODE }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_specdec-${{ env.SPEC_DECODING }}_${{ runner.name }} 
run: | # bash ./runners/launch_${RUNNER_NAME%%_*}.sh set -x diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh new file mode 100644 index 000000000..debc3be8f --- /dev/null +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -0,0 +1,49 @@ + +#!/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS + +# Always clone and setup Dynamo +echo "Cloning Dynamo repository..." +git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git +cd "dynamo/components/backends/sglang/slurm_jobs" + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="04:00:00" +export MODEL_PATH=$MODEL_PATH +export CONFIG_DIR=$CONFIG_DIR +export CONTAINER_IMAGE=$IMAGE + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimted by 'x'. This is because of how the underlying launch script +# expects the concurrencies. 
+bash ./submit_disagg.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $DECODE_NUM_WORKERS \ + $N_ADDITIONAL_FRONTENDS \ + $ISL $OSL "${CONC_LIST// /x}" inf + +# if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then +# bash ./submit_disagg.sh $PREFILL_NODES \ +# $PREFILL_NUM_WORKERS \ +# $DECODE_NODES \ +# $DECODE_NUM_WORKERS \ +# $DECODE_NUM_WORKERS \ +# $ISL $OSL "${CONC_LIST// /x}" inf +# elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then +# concurrency_list="128x256x384x448x512x576x1024x2048x4096" +# bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL "${CONC_LIST// /x}" inf +# else +# echo "Unsupported ISL/OSL combination: $ISL/$OSL" +# exit 1 +# fi \ No newline at end of file From 9c8a245e3e1a726ea065bb1681d93d085f50537c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 16:05:10 -0600 Subject: [PATCH 24/98] testing support for dsr1 test --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 64624fedb..48e89cba9 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -15,7 +15,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo 
"multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo $CONFIG_JSON_MULTI_NODE From 5959a3c403b85939824bba9806ef63963aa963fd Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 16:17:41 -0600 Subject: [PATCH 25/98] testing support for dsr1 test --- .../dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 49 ------------------- .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 48 +++++++++++++++--- 2 files changed, 40 insertions(+), 57 deletions(-) delete mode 100644 benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh deleted file mode 100644 index debc3be8f..000000000 --- a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh +++ /dev/null @@ -1,49 +0,0 @@ - -#!/bin/bash - -set -x - -source "$(dirname "$0")/benchmark_lib.sh" - -check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ - PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ - DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ - PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS - -# Always clone and setup Dynamo -echo "Cloning Dynamo repository..." -git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git -cd "dynamo/components/backends/sglang/slurm_jobs" - -# Set up SGL launch script-specific environment variables -export TIME_LIMIT="04:00:00" -export MODEL_PATH=$MODEL_PATH -export CONFIG_DIR=$CONFIG_DIR -export CONTAINER_IMAGE=$IMAGE - -# Launch jobs based on ISL/OSL -# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented -# by a list of numbers delimted by 'x'. This is because of how the underlying launch script -# expects the concurrencies. 
-bash ./submit_disagg.sh $PREFILL_NODES \ - $PREFILL_NUM_WORKERS \ - $DECODE_NODES \ - $DECODE_NUM_WORKERS \ - $DECODE_NUM_WORKERS \ - $N_ADDITIONAL_FRONTENDS \ - $ISL $OSL "${CONC_LIST// /x}" inf - -# if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then -# bash ./submit_disagg.sh $PREFILL_NODES \ -# $PREFILL_NUM_WORKERS \ -# $DECODE_NODES \ -# $DECODE_NUM_WORKERS \ -# $DECODE_NUM_WORKERS \ -# $ISL $OSL "${CONC_LIST// /x}" inf -# elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then -# concurrency_list="128x256x384x448x512x576x1024x2048x4096" -# bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL "${CONC_LIST// /x}" inf -# else -# echo "Unsupported ISL/OSL combination: $ISL/$OSL" -# exit 1 -# fi \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index a93128d41..debc3be8f 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -1,17 +1,49 @@ -#!/usr/bin/bash + +#!/bin/bash set -x source "$(dirname "$0")/benchmark_lib.sh" -check_env_vars - -SGL_SLURM_JOBS_PATH="components/backends/sglang/slurm_jobs" +check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS # Always clone and setup Dynamo echo "Cloning Dynamo repository..." 
-rm -rf "$DYNAMO_PATH" -git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git $DYNAMO_PATH -cd dynamo -cd "$SGL_SLURM_JOBS_PATH" +git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git +cd "dynamo/components/backends/sglang/slurm_jobs" + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="04:00:00" +export MODEL_PATH=$MODEL_PATH +export CONFIG_DIR=$CONFIG_DIR +export CONTAINER_IMAGE=$IMAGE + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimted by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +bash ./submit_disagg.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $DECODE_NUM_WORKERS \ + $N_ADDITIONAL_FRONTENDS \ + $ISL $OSL "${CONC_LIST// /x}" inf +# if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then +# bash ./submit_disagg.sh $PREFILL_NODES \ +# $PREFILL_NUM_WORKERS \ +# $DECODE_NODES \ +# $DECODE_NUM_WORKERS \ +# $DECODE_NUM_WORKERS \ +# $ISL $OSL "${CONC_LIST// /x}" inf +# elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then +# concurrency_list="128x256x384x448x512x576x1024x2048x4096" +# bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL "${CONC_LIST// /x}" inf +# else +# echo "Unsupported ISL/OSL combination: $ISL/$OSL" +# exit 1 +# fi \ No newline at end of file From a23315be2cafc9017ed26136baaab480f8d18fec Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 16:19:52 -0600 Subject: [PATCH 26/98] testing support for dsr1 test --- benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index debc3be8f..801355bdd 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -29,7 +29,6 @@ bash 
./submit_disagg.sh $PREFILL_NODES \ $PREFILL_NUM_WORKERS \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ - $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf From a809af260a18efedb13d1c41f500b64c31bccb99 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 24 Nov 2025 16:22:52 -0600 Subject: [PATCH 27/98] testing support for dsr1 test --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 66 +------------------ runners/launch_gb200-nv-copy-2.sh | 64 ++++++++++++++++++ 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index ec40af905..29efb8dbb 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -58,68 +58,4 @@ sbatch --nodes=${total_nodes} \ ${DECODE_MTP_SIZE} "${CONC_LIST}" \ ${gen_nodes} ${kind} \ ${MODEL_PATH} ${SERVED_MODEL_NAME} \ - ${IMAGE} ${ISL} ${OSL} - -# Wait for all jobs to complete -echo "Waiting for all jobs to complete..." -while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - echo "Jobs still running..." - squeue --steps -u $USER - sleep 30 -done - -# Find the logs directory (should be only one for this ISL/OSL combination) -LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) -if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 -fi - -echo "Found logs directory: $LOGS_DIR" - -# Find all result subdirectories in this logs directory -RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) - -if [ -z "$RESULT_SUBDIRS" ]; then - echo "No result subdirectories found in $LOGS_DIR" - exit 1 -fi - -echo "Found result subdirectories:" -echo "$RESULT_SUBDIRS" - -# Process results from all configurations -for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Process individual concurrency result files - RESULTS_SUBDIR="$result_subdir/results" - - if [ -d "$RESULTS_SUBDIR" ]; then - echo "Processing results from: $RESULTS_SUBDIR" - - # Find all concurrency result files with new format - CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") - - for result_file in $CONCURRENCY_FILES; do - if [ -f "$result_file" ]; then - # Extract concurrency and GPU count from filename - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') - gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') - echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" - - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" - - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi - done - else - echo "Results subdirectory not found: $RESULTS_SUBDIR" - fi -done \ No newline at end of file + ${IMAGE} ${ISL} ${OSL} \ No newline at end of file diff --git a/runners/launch_gb200-nv-copy-2.sh 
b/runners/launch_gb200-nv-copy-2.sh index 97d3b1fcd..bbdadcb9e 100755 --- a/runners/launch_gb200-nv-copy-2.sh +++ b/runners/launch_gb200-nv-copy-2.sh @@ -33,6 +33,70 @@ export OSL="$OSL" bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}_slurm.sh" +# Wait for all jobs to complete +echo "Waiting for all jobs to complete..." +while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do + echo "Jobs still running..." + squeue --steps -u $USER + sleep 30 +done + +# Find the logs directory (should be only one for this ISL/OSL combination) +LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) +if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 +fi + +echo "Found logs directory: $LOGS_DIR" + +# Find all result subdirectories in this logs directory +RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) + +if [ -z "$RESULT_SUBDIRS" ]; then + echo "No result subdirectories found in $LOGS_DIR" + exit 1 +fi + +echo "Found result subdirectories:" +echo "$RESULT_SUBDIRS" + +# Process results from all configurations +for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Process individual concurrency result files + RESULTS_SUBDIR="$result_subdir/results" + + if [ -d "$RESULTS_SUBDIR" ]; then + echo "Processing results from: $RESULTS_SUBDIR" + + # Find all concurrency result files with new format + CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") + + for result_file in $CONCURRENCY_FILES; do + if [ -f "$result_file" ]; then + # Extract concurrency and GPU count from filename + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') + gpus=$(echo "$filename" | sed 
's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') + echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" + + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + else + echo "Results subdirectory not found: $RESULTS_SUBDIR" + fi +done + # ### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs # if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then From dbc5a37e8c6627eb6de5b99baa01449996b09c2d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 25 Nov 2025 09:00:36 -0600 Subject: [PATCH 28/98] testing --- .github/configs/nvidia-master.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 5735a5d41..0923d7ca9 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -822,7 +822,8 @@ dsr1-fp8-gb200-dynamo-sglang: osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + # conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + conc-list: [ 128 ] prefill: num-worker: 6 # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: From ef127b2ea4d31446cb523efecc7b8c38ef3030c9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 25 Nov 2025 13:36:39 -0600 Subject: [PATCH 29/98] some changes to generate sweeps --- utils/matrix-logic/generate_sweep_configs.py | 232 +++++++++---------- 1 file changed, 113 insertions(+), 119 deletions(-) diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index c30e64ff4..f605f66e6 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -26,8 +26,8 @@ def 
seq_len_to_str(isl: int, osl: int) -> str: def generate_full_sweep(args, all_config_data, runner_data): """Generate full sweep configurations with optional filtering. - Supports filtering by model prefix, precision, framework, runner type, and sequence lengths. - Supports test mode to only run highest TP with lowest concurrency. + Supports filtering by model prefix, precision, framework, runner type, sequence lengths, + and max concurrency. All filters are optional - can generate sweeps for all configs or filter by specific criteria. @@ -88,78 +88,88 @@ def generate_full_sweep(args, all_config_data, runner_data): bmk_space = seq_config[Fields.SEARCH_SPACE.value] - if args.test_mode: - # In test mode, skip multinode configs for now + for bmk in bmk_space: if is_multinode: - continue - - # In test mode, use highest TP with lowest concurrency - highest_tp_bmk = max( - bmk_space, key=lambda x: x[Fields.TP.value]) - tp = highest_tp_bmk[Fields.TP.value] - conc = highest_tp_bmk[Fields.CONC_START.value] - ep = highest_tp_bmk.get(Fields.EP.value) - dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) - - seq_len_str = seq_len_to_str(isl, osl) - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.ISL.value: isl, - Fields.OSL.value: osl, - Fields.TP.value: tp, - Fields.EP.value: 1, # Default - Fields.DP_ATTN.value: False, # Default - Fields.CONC.value: conc, - Fields.MAX_MODEL_LEN.value: isl + osl + 200, - Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", - } - - if ep is not None: - entry[Fields.EP.value] = ep - if dp_attn is not None: - entry[Fields.DP_ATTN.value] = dp_attn - - validate_matrix_output(entry, is_multinode) - matrix_values.append(entry) - else: - # Full sweep mode - for bmk in bmk_space: - if is_multinode: - # Skip multinode configs when --single-node is specified - if not args.multi_node: - continue - - # Multinode configuration 
- # spec_decoding defaults to "none" if not specified - spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") - - prefill = bmk[Fields.PREFILL.value] - decode = bmk[Fields.DECODE.value] - - # Get concurrency values (can be list or range) - conc_list = bmk.get(Fields.CONC_LIST.value) - # If it's a list - if conc_list: - conc_values = conc_list - # If it's a range - else: - conc_start = bmk[Fields.CONC_START.value] - conc_end = bmk[Fields.CONC_END.value] - conc_values = [] - conc = conc_start - while conc <= conc_end: - conc_values.append(conc) - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end - - # For multinode, create a single entry with conc as a list + # Skip multinode configs when --single-node is specified + if not args.multi_node: + continue + + # Multinode configuration + # spec_decoding defaults to "none" if not specified + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") + + prefill = bmk[Fields.PREFILL.value] + decode = bmk[Fields.DECODE.value] + + # Get concurrency values (can be list or range) + conc_list = bmk.get(Fields.CONC_LIST.value) + # If it's a list + if conc_list: + conc_values = conc_list + # If it's a range + else: + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + conc_values = [] + conc = conc_start + while conc <= conc_end: + conc_values.append(conc) + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end + + # Apply max-conc filter if specified + if args.max_conc is not None: + conc_values = [c for c in conc_values if c <= args.max_conc] + if not conc_values: + continue # Skip this bmk if no concurrency values remain + + # For multinode, create a single entry with conc as a list + seq_len_str = seq_len_to_str(isl, osl) + entry = { + Fields.IMAGE.value: image, + Fields.MODEL.value: model, + Fields.PRECISION.value: precision, + Fields.FRAMEWORK.value: framework, + Fields.RUNNER.value: runner, + 
Fields.ISL.value: isl, + Fields.OSL.value: osl, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: prefill, + Fields.DECODE.value: decode, + Fields.CONC.value: conc_values, # Pass the entire list for multinode + Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + } + + validate_matrix_output(entry, is_multinode) + matrix_values.append(entry) + elif args.single_node: + # Single-node configuration + tp = bmk[Fields.TP.value] + conc_start = bmk[Fields.CONC_START.value] + conc_end = bmk[Fields.CONC_END.value] + ep = bmk.get(Fields.EP.value) + dp_attn = bmk.get(Fields.DP_ATTN.value) + + # Apply max-tp filter if specified + if args.max_tp and tp > args.max_tp: + continue + + # Apply max-ep filter if specified + if args.max_ep and ep is not None and ep > args.max_ep: + continue + + # Apply max-conc filter if specified + if args.max_conc is not None: + conc_end = min(conc_end, args.max_conc) + if conc_start > conc_end: + continue # Skip this bmk if conc_start exceeds max_conc + + conc = conc_start + while conc <= conc_end: seq_len_str = seq_len_to_str(isl, osl) entry = { Fields.IMAGE.value: image, @@ -169,56 +179,27 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.RUNNER.value: runner, Fields.ISL.value: isl, Fields.OSL.value: osl, - Fields.SPEC_DECODING.value: spec_decoding, - Fields.PREFILL.value: prefill, - Fields.DECODE.value: decode, - Fields.CONC.value: conc_values, # Pass the entire list for multinode + Fields.TP.value: tp, + Fields.CONC.value: conc, Fields.MAX_MODEL_LEN.value: isl + osl + 200, + Fields.EP.value: 1, # Default + Fields.DP_ATTN.value: False, # Default Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", } + if ep is not None: + entry[Fields.EP.value] = ep + if dp_attn is not None: + entry[Fields.DP_ATTN.value] = dp_attn + validate_matrix_output(entry, is_multinode) matrix_values.append(entry) - elif args.single_node: - # Single-node configuration - tp = 
bmk[Fields.TP.value] - conc_start = bmk[Fields.CONC_START.value] - conc_end = bmk[Fields.CONC_END.value] - ep = bmk.get(Fields.EP.value) - dp_attn = bmk.get(Fields.DP_ATTN.value) - - conc = conc_start - while conc <= conc_end: - seq_len_str = seq_len_to_str(isl, osl) - entry = { - Fields.IMAGE.value: image, - Fields.MODEL.value: model, - Fields.PRECISION.value: precision, - Fields.FRAMEWORK.value: framework, - Fields.RUNNER.value: runner, - Fields.ISL.value: isl, - Fields.OSL.value: osl, - Fields.TP.value: tp, - Fields.CONC.value: conc, - Fields.MAX_MODEL_LEN.value: isl + osl + 200, - Fields.EP.value: 1, # Default - Fields.DP_ATTN.value: False, # Default - Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", - } - - if ep is not None: - entry[Fields.EP.value] = ep - if dp_attn is not None: - entry[Fields.DP_ATTN.value] = dp_attn - - validate_matrix_output(entry, is_multinode) - matrix_values.append(entry) - if conc == conc_end: - break - conc *= args.step_size - if conc > conc_end: - conc = conc_end + if conc == conc_end: + break + conc *= args.step_size + if conc > conc_end: + conc = conc_end if len(matrix_values) == 0: raise ValueError("No configs found matching input filters.") @@ -411,9 +392,22 @@ def main(): help='Step size for concurrency values (default: 2)' ) full_sweep_parser.add_argument( - '--test-mode', - action='store_true', - help='Test mode: only run highest TP with lowest concurrency for each matching config' + '--max-conc', + type=int, + required=False, + help='Maximum concurrency value to include (filters out higher concurrency values)' + ) + full_sweep_parser.add_argument( + '--max-tp', + type=int, + required=False, + help='Maximum tensor parallelism value to include (single-node only)' + ) + full_sweep_parser.add_argument( + '--max-ep', + type=int, + required=False, + help='Maximum expert parallelism value to include (single-node only)' ) node_type_group = full_sweep_parser.add_mutually_exclusive_group(required=True) 
node_type_group.add_argument( From 96171cf1f3df824b7888fd3ce3d465234286c25c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 25 Nov 2025 17:14:32 -0600 Subject: [PATCH 30/98] testing and debugging --- .github/configs/nvidia-master.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0923d7ca9..0703bd420 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -800,7 +800,8 @@ dsr1-fp8-gb200-dynamo-sglang: osl: 1024 search-space: - spec-decoding: "none" - conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] + # conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] + conc-list: [ 1024, 2048 ] prefill: num-worker: 3 # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: From faad1fc321fc5c37d97b06225a9d4cdec3806c51 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 08:30:34 -0600 Subject: [PATCH 31/98] adding new file code for sglang --- runners/launch_gb200-nv-copy-2.sh | 333 ++++++++---------------------- 1 file changed, 86 insertions(+), 247 deletions(-) diff --git a/runners/launch_gb200-nv-copy-2.sh b/runners/launch_gb200-nv-copy-2.sh index bbdadcb9e..e4468555b 100755 --- a/runners/launch_gb200-nv-copy-2.sh +++ b/runners/launch_gb200-nv-copy-2.sh @@ -41,6 +41,10 @@ while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do sleep 30 done +# FIXME: The below is bad and is a result of the indirection of the ways in which +# Dynamo jobs are launched. In a follow-up PR, the location of the result file should not +# depend on the runner, it should always be in the same spot in the GH workspace. + # Find the logs directory (should be only one for this ISL/OSL combination) LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) if [ -z "$LOGS_DIR" ]; then @@ -62,252 +66,87 @@ echo "Found result subdirectories:" echo "$RESULT_SUBDIRS" # Process results from all configurations -for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Process individual concurrency result files - RESULTS_SUBDIR="$result_subdir/results" - - if [ -d "$RESULTS_SUBDIR" ]; then - echo "Processing results from: $RESULTS_SUBDIR" - - # Find all concurrency result files with new format - CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") - - for result_file in $CONCURRENCY_FILES; do - if [ -f "$result_file" ]; then - # Extract concurrency and GPU count from filename - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') - gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') - echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" - - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" - - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi - done - else - echo "Results subdirectory not found: $RESULTS_SUBDIR" +if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then + + # Find the logs directory (should be only one for this ISL/OSL combination) + LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 fi -done -# ### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs -# if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then - -# # Set up Dynamo repository path -# DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" -# PERFORMANCE_SWEEPS_PATH="$DYNAMO_PATH/components/backends/trtllm/performance_sweeps" - -# # Overview: -# # The Dynamo repository contains the bench_serving repository as a submodule. -# # The submit_disagg.sh script, located at $PERFORMANCE_SWEEPS_PATH, orchestrates the entire benchmarking workflow: -# # 1. Launches the Dynamo inference service with the specified configuration. -# # 2. Waits for the service to become healthy. -# # 3. Initiates benchmarking using the bench_serving tools. -# # 4. Monitors all jobs until completion. -# # 5. Collects and processes the results. - -# # Always clone and setup Dynamo -# echo "Cloning Dynamo repository..." -# rm -rf "$DYNAMO_PATH" -# git clone https://github.com/cquil11/dynamo.git "$DYNAMO_PATH" -# cd "$DYNAMO_PATH" -# git checkout release/0.5.1-rc0.20251105-cam -# git submodule update --init --recursive - -# # Navigate to performance sweeps directory -# cd "$PERFORMANCE_SWEEPS_PATH" - -# # 1. CACHE_TRANSCEIVER_MAX_NUM_TOKENS controls the max_tokens_in_buffer value -# # in cache_transceiver_config of TensorRT-LLM context and generation workers. -# # Specifically, it is the max number of tokens the transfer buffer can fit. 
- -# # Set up environment variables based on ISL/OSL -# if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then -# export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 -# elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then -# export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 -# else -# echo "Unsupported ISL/OSL combination: $ISL/$OSL" -# exit 1 -# fi - -# # New stuff -# # CONC_LIST -# # ISL -# # OSL -# # IMAGE - -# # PREFILL_NUM_WORKERS -# # PREFILL_TP -# # PREFILL_EP -# # PREFILL_DP_ATTN - -# # DECODE_NUM_WORKERS -# # DECODE_TP -# # DECODE_EP -# # DECODE_DP_ATTN - -# # Additional env vars needed -# # PREFILL_MAX_NUM_TOKENS -# # PREFILL_MAX_BATCH_SIZE -# # DECODE_MAX_NUM_TOKENS -# # DECODE_MAX_BATCH_SIZE -# # DECODE_GPU_MEM_FRACTION -# # DECODE_MTP_SIZE -# # DECODE_EPLB_NUM_SLOTS - -# echo "CONC_LIST=$CONC_LIST" -# echo "ISL=$ISL" -# echo "OSL=$OSL" -# echo "IMAGE=$IMAGE" - -# echo "PREFILL_NUM_WORKERS=$PREFILL_NUM_WORKERS" -# echo "PREFILL_TP=$PREFILL_TP" -# echo "PREFILL_EP=$PREFILL_EP" -# echo "PREFILL_DP_ATTN=$PREFILL_DP_ATTN" - -# echo "DECODE_NUM_WORKERS=$DECODE_NUM_WORKERS" -# echo "DECODE_TP=$DECODE_TP" -# echo "DECODE_EP=$DECODE_EP" -# echo "DECODE_DP_ATTN=$DECODE_DP_ATTN" - -# echo "PREFILL_MAX_NUM_TOKENS=$PREFILL_MAX_NUM_TOKENS" -# echo "PREFILL_MAX_BATCH_SIZE=$PREFILL_MAX_BATCH_SIZE" -# echo "DECODE_MAX_NUM_TOKENS=$DECODE_MAX_NUM_TOKENS" -# echo "DECODE_MAX_BATCH_SIZE=$DECODE_MAX_BATCH_SIZE" -# echo "DECODE_GPU_MEM_FRACTION=$DECODE_GPU_MEM_FRACTION" -# echo "DECODE_MTP_SIZE=$DECODE_MTP_SIZE" -# echo "DECODE_EPLB_NUM_SLOTS=$DECODE_EPLB_NUM_SLOTS" - -# # For GB200, we use 4 tasks per node. 
-# ntasks_per_node=4 -# additional_slurm_args="--time=04:00:00" - -# kind=dynamo_disagg - -# gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) -# total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) -# total_tasks=$((total_nodes * ntasks_per_node)) - -# # 4608 prefill max num toks originally -# if [ $ISL == $OSL ]; then -# sbatch --nodes=${total_nodes} \ -# --ntasks=${total_tasks} \ -# --ntasks-per-node=${ntasks_per_node} \ -# --segment=${total_nodes} ${additional_slurm_args} \ -# benchmark_disagg.slurm \ -# ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ -# ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ -# ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ -# ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ -# ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ -# ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ -# ${DECODE_MTP_SIZE} ${CONC_LIST} \ -# ${gen_nodes} ${kind} \ -# ${MODEL_PATH} ${SERVED_MODEL_NAME} \ -# ${IMAGE} ${ISL} ${OSL} -# # else -# # sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} -# fi -# fi - -# # Wait for all jobs to complete -# # echo "Waiting for all jobs to complete..." -# while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do -# echo "Jobs still running..." -# squeue --steps -u $USER -# sleep 60 -# done - -# ### FRAMEWORK_DIFF_IF_STATEMENT #3 - difference in log post-processing -# if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then - -# # Find the logs directory (should be only one for this ISL/OSL combination) -# LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) -# if [ -z "$LOGS_DIR" ]; then -# echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" -# exit 1 -# fi - -# echo "Found logs directory: $LOGS_DIR" - -# # Find all result subdirectories in this logs directory -# RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) - -# if [ -z "$RESULT_SUBDIRS" ]; then -# echo "No result subdirectories found in $LOGS_DIR" -# exit 1 -# fi - -# echo "Found result subdirectories:" -# echo "$RESULT_SUBDIRS" - -# # Process results from all configurations -# for result_subdir in $RESULT_SUBDIRS; do -# echo "Processing result subdirectory: $result_subdir" - -# # Extract configuration info from directory name -# CONFIG_NAME=$(basename "$result_subdir") - -# # Process individual concurrency result files -# RESULTS_SUBDIR="$result_subdir/results" - -# if [ -d "$RESULTS_SUBDIR" ]; then -# echo "Processing results from: $RESULTS_SUBDIR" - -# # Find all concurrency result files with new format -# CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") - -# for result_file in $CONCURRENCY_FILES; do -# if [ -f "$result_file" ]; then -# # Extract concurrency and GPU count from filename -# filename=$(basename "$result_file") -# concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') -# gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') -# echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" - -# # Copy the result file to workspace with a unique name -# WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" -# cp "$result_file" "$WORKSPACE_RESULT_FILE" - -# echo "Copied result file to: $WORKSPACE_RESULT_FILE" -# fi -# done -# else -# echo "Results subdirectory not found: $RESULTS_SUBDIR" -# fi -# done - -# else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement -# # 
Find the latest log directory -# # we do "tail -1" here since only the latest job will yield the result -# LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) -# if [ -z "$LOGS_DIR" ]; then -# echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" -# exit 1 -# fi - -# echo "Found logs directory: $LOGS_DIR" -# ls $LOGS_DIR - -# # Result JSON are contained within the result directory -# for result_file in $(find $LOGS_DIR -type f); do -# # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json -# file_name=$(basename $result_file) -# if [ -f $result_file ]; then -# # Copy the result file to workspace with a unique name -# WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" -# echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}" -# cp $result_file $WORKSPACE_RESULT_FILE -# fi -# done -# fi - -# echo "All result files processed" \ No newline at end of file + echo "Found logs directory: $LOGS_DIR" + + # Find all result subdirectories in this logs directory + RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) + + if [ -z "$RESULT_SUBDIRS" ]; then + echo "No result subdirectories found in $LOGS_DIR" + exit 1 + fi + + echo "Found result subdirectories:" + echo "$RESULT_SUBDIRS" + + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Process individual concurrency result files + RESULTS_SUBDIR="$result_subdir/results" + + if [ -d "$RESULTS_SUBDIR" ]; then + echo "Processing results from: $RESULTS_SUBDIR" + + # Find all concurrency result files with new format + CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") + + for result_file in $CONCURRENCY_FILES; do + if [ -f "$result_file" ]; then + # 
Extract concurrency and GPU count from filename + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') + gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') + echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" + + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + else + echo "Results subdirectory not found: $RESULTS_SUBDIR" + fi + done + +else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement + # Find the latest log directory + # we do "tail -1" here since only the latest job will yield the result + LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) + if [ -z "$LOGS_DIR" ]; then + echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" + exit 1 + fi + + echo "Found logs directory: $LOGS_DIR" + ls $LOGS_DIR + + # Result JSON are contained within the result directory + for result_file in $(find $LOGS_DIR -type f); do + # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json + file_name=$(basename $result_file) + if [ -f $result_file ]; then + # Copy the result file to workspace with a unique name + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" + echo "Found result file ${result_file}. 
Copying them to ${WORKSPACE_RESULT_FILE}" + cp $result_file $WORKSPACE_RESULT_FILE + fi + done +fi + +echo "All result files processed" \ No newline at end of file From 4b615dab8aaef83e98ca2dfd2efe01c2504e50db Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 09:06:46 -0600 Subject: [PATCH 32/98] adding new file code for sglang --- runners/launch_gb200-nv-copy-2.sh | 21 --------------------- 1 file changed, 21 deletions(-) diff --git a/runners/launch_gb200-nv-copy-2.sh b/runners/launch_gb200-nv-copy-2.sh index e4468555b..26f5d51c9 100755 --- a/runners/launch_gb200-nv-copy-2.sh +++ b/runners/launch_gb200-nv-copy-2.sh @@ -45,26 +45,6 @@ done # Dynamo jobs are launched. In a follow-up PR, the location of the result file should not # depend on the runner, it should always be in the same spot in the GH workspace. -# Find the logs directory (should be only one for this ISL/OSL combination) -LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) -if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 -fi - -echo "Found logs directory: $LOGS_DIR" - -# Find all result subdirectories in this logs directory -RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) - -if [ -z "$RESULT_SUBDIRS" ]; then - echo "No result subdirectories found in $LOGS_DIR" - exit 1 -fi - -echo "Found result subdirectories:" -echo "$RESULT_SUBDIRS" - # Process results from all configurations if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then @@ -123,7 +103,6 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then echo "Results subdirectory not found: $RESULTS_SUBDIR" fi done - else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory # we do "tail -1" here since only the latest job will yield the result From b25b7304aee0ed269a77facebe0cdb8e1aba4b0d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 09:44:30 -0600 Subject: [PATCH 33/98] 
changing file path --- runners/launch_gb200-nv-copy-2.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_gb200-nv-copy-2.sh b/runners/launch_gb200-nv-copy-2.sh index 26f5d51c9..c65dd59d6 100755 --- a/runners/launch_gb200-nv-copy-2.sh +++ b/runners/launch_gb200-nv-copy-2.sh @@ -106,14 +106,14 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory # we do "tail -1" here since only the latest job will yield the result - LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) + LOGS_DIR=$(find . -path "*/logs/*/vllm_isl_${ISL}_osl_${OSL}" -type d 2>/dev/null | sort -V | tail -1) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 fi echo "Found logs directory: $LOGS_DIR" - ls $LOGS_DIR + ls -la $LOGS_DIR # Result JSON are contained within the result directory for result_file in $(find $LOGS_DIR -type f); do From 4f505221d9066217e744de1db2edee8b398f1e45 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 10:27:02 -0600 Subject: [PATCH 34/98] updating multinode fn hash --- .github/workflows/benchmark-multinode-tmpl.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index e9f4acbb5..f21b45fa5 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -105,7 +105,7 @@ jobs: benchmark: runs-on: ${{ inputs.runner }} timeout-minutes: 480 - name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} mtp-${{ inputs.mtp-mode }}" + name: "${{ inputs.exp-name }} ${{ inputs.runner }} ${{ inputs.framework }} ${{ inputs.precision }} specdecod-${{ inputs.spec-decoding }}" steps: - name: Resource cleanup @@ -125,7 +125,7 @@ jobs: - name: Launch multi-node job script 
env: RUNNER_NAME: ${{ runner.name }} - RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_${{ env.FRAMEWORK }}_specdec-${{ env.SPEC_DECODING }}_${{ runner.name }} + RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_ptp${{ env.PREFILL_TP }}pep${{ env.PREFILL_EP}}_dtp${{ env.DECODE_TP}}dep${{ env.DECODE_EP }}_${{ env.FRAMEWORK }}_specdec-${{ env.SPEC_DECODING }}_conc${{ join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | # bash ./runners/launch_${RUNNER_NAME%%_*}.sh set -x From 3be1d1b34561be8143caec545dbdcbd63f7dab5b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 10:27:14 -0600 Subject: [PATCH 35/98] updating multinode fn hash --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 48e89cba9..af28be1a3 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -15,7 +15,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-sglang --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-trt --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo $CONFIG_JSON_MULTI_NODE From a033f5dc58f23b2ec9e5b0b1c79821288b955814 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 10:54:32 -0600 
Subject: [PATCH 36/98] dynamo trtllm to dynamo trt --- runners/launch_gb200-nv-copy-2.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/runners/launch_gb200-nv-copy-2.sh b/runners/launch_gb200-nv-copy-2.sh index c65dd59d6..e4d3827c3 100755 --- a/runners/launch_gb200-nv-copy-2.sh +++ b/runners/launch_gb200-nv-copy-2.sh @@ -46,7 +46,7 @@ done # depend on the runner, it should always be in the same spot in the GH workspace. # Process results from all configurations -if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then +if [[ $FRAMEWORK == "dynamo-trt" ]]; then # Find the logs directory (should be only one for this ISL/OSL combination) LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) From 3f09a0e42744761ccd5c9d206d0f63aad5657841 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 15:51:38 -0600 Subject: [PATCH 37/98] changing process result --- .github/configs/nvidia-master.yaml | 2 + .../workflows/benchmark-multinode-tmpl.yml | 4 + .../workflows/full-sweep-1k1k-scheduler.yml | 3 +- utils/matrix-logic/generate_sweep_configs.py | 7 + utils/matrix-logic/validation.py | 5 + utils/process_result.py | 126 +++++++++++++----- 6 files changed, 113 insertions(+), 34 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 0703bd420..ea6d9026d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -334,6 +334,7 @@ dsr1-fp4-gb200-dynamo-trt: precision: fp4 framework: dynamo-trt multinode: true + disagg: true seq-len-configs: - isl: 1024 osl: 1024 @@ -795,6 +796,7 @@ dsr1-fp8-gb200-dynamo-sglang: precision: fp8 framework: dynamo-sglang multinode: true + disagg: true seq-len-configs: - isl: 1024 osl: 1024 diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index f21b45fa5..99e42fd8a 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ 
b/.github/workflows/benchmark-multinode-tmpl.yml @@ -33,6 +33,9 @@ on: spec-decoding: required: true type: string + disagg: + required: true + type: string max-model-len: required: true @@ -87,6 +90,7 @@ env: RANDOM_RANGE_RATIO: ${{ inputs.random-range-ratio }} CONC_LIST: ${{ join(fromJson(inputs.conc-list), ' ') }} SPEC_DECODING: ${{ inputs.spec-decoding }} + DISAGG: ${{ inputs.disagg }} PREFILL_NUM_WORKERS: ${{ inputs.prefill-num-worker }} PREFILL_TP: ${{ inputs.prefill-tp }} diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index af28be1a3..f5b552af5 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -15,7 +15,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --framework dynamo-trt --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo $CONFIG_JSON_MULTI_NODE @@ -54,6 +54,7 @@ jobs: exp-name: "dsr1_1k1k" conc-list: ${{ toJson(matrix.config.conc) }} spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} prefill-num-worker: ${{ matrix.config.prefill.num-worker }} prefill-tp: ${{ matrix.config.prefill.tp }} diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index f605f66e6..58bfa6c99 100644 --- 
a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -69,6 +69,8 @@ def generate_full_sweep(args, all_config_data, runner_data): # Check if this is a multinode config is_multinode = val.get(Fields.MULTINODE.value, False) + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) seq_len_configs = val[Fields.SEQ_LEN_CONFIGS.value] image = val[Fields.IMAGE.value] @@ -142,6 +144,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.CONC.value: conc_values, # Pass the entire list for multinode Fields.MAX_MODEL_LEN.value: isl + osl + 200, Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, } validate_matrix_output(entry, is_multinode) @@ -185,6 +188,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.EP.value: 1, # Default Fields.DP_ATTN.value: False, # Default Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", + Fields.DISAGG.value: disagg, } if ep is not None: @@ -234,6 +238,8 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): # Get model code for exp_name model_code = val[Fields.MODEL_PREFIX.value] + # Get disagg value, defaulting to False if not specified + disagg = val.get(Fields.DISAGG.value, False) # Find 1k1k config target_config = None @@ -269,6 +275,7 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): Fields.CONC.value: lowest_conc, Fields.MAX_MODEL_LEN.value: 2048, Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, } # Add optional fields if they exist diff --git a/utils/matrix-logic/validation.py b/utils/matrix-logic/validation.py index 887baafc1..16696c888 100644 --- a/utils/matrix-logic/validation.py +++ b/utils/matrix-logic/validation.py @@ -43,6 +43,7 @@ class Fields(Enum): CONC = 'conc' MAX_MODEL_LEN = 'max-model-len' EXP_NAME = 'exp-name' + DISAGG = 'disagg' class SingleNodeMatrixEntry(BaseModel): @@ 
-67,6 +68,7 @@ class SingleNodeMatrixEntry(BaseModel): conc: Union[int, List[int]] max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) + disagg: bool class WorkerConfig(BaseModel): @@ -101,6 +103,7 @@ class MultiNodeMatrixEntry(BaseModel): conc: List[int] max_model_len: int = Field(alias=Fields.MAX_MODEL_LEN.value) exp_name: str = Field(alias=Fields.EXP_NAME.value) + disagg: bool def validate_matrix_output(entry: dict, is_multinode: bool) -> dict: @@ -236,6 +239,7 @@ class SingleNodeMasterConfigEntry(BaseModel): framework: str runner: str multinode: Literal[False] + disagg: bool = Field(default=False) seq_len_configs: List[SingleNodeSeqLenConfig] = Field( alias=Fields.SEQ_LEN_CONFIGS.value) @@ -251,6 +255,7 @@ class MultiNodeMasterConfigEntry(BaseModel): framework: str runner: str multinode: Literal[True] + disagg: bool = Field(default=False) seq_len_configs: List[MultiNodeSeqLenConfig] = Field( alias=Fields.SEQ_LEN_CONFIGS.value) diff --git a/utils/process_result.py b/utils/process_result.py index 1a59ce301..fa22df56d 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -4,58 +4,118 @@ from pathlib import Path -hw = os.environ.get('RUNNER_TYPE') -tp_size = int(os.environ.get('TP')) -ep_size = int(os.environ.get('EP_SIZE')) -prefill_gpus_str = os.environ.get('PREFILL_GPUS', '') -decode_gpus_str = os.environ.get('DECODE_GPUS', '') - -# If empty string (aggregated runs), assign to tp_size (total gpus), otherwise convert to int -prefill_gpus = tp_size if not prefill_gpus_str else int(prefill_gpus_str) -decode_gpus = tp_size if not decode_gpus_str else int(decode_gpus_str) -dp_attention = os.environ.get('DP_ATTENTION') -result_filename = os.environ.get('RESULT_FILENAME') -framework = os.environ.get('FRAMEWORK') -precision = os.environ.get('PRECISION') -mtp_mode = os.environ.get('MTP_MODE') -isl = os.environ.get('ISL') -osl = os.environ.get('OSL') +def get_required_env_vars(required_vars): + 
"""Load and validate required environment variables.""" + env_values = {} + missing_env_vars = [] + + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}") + + return env_values + + +# Base required env vars +base_env = get_required_env_vars([ + 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', + 'RESULT_FILENAME', 'ISL', 'OSL', 'DISAGG' +]) + +hw = base_env['RUNNER_TYPE'] +framework = base_env['FRAMEWORK'] +precision = base_env['PRECISION'] +spec_decoding = base_env['SPEC_DECODING'] +disagg = base_env['DISAGG'].lower() == 'true' +result_filename = base_env['RESULT_FILENAME'] +isl = base_env['ISL'] +osl = base_env['OSL'] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) data = { 'hw': hw, - 'tp': tp_size, - 'ep': ep_size, - 'dp_attention': dp_attention, # true or false 'conc': int(bmk_result['max_concurrency']), 'model': bmk_result['model_id'], 'framework': framework, 'precision': precision, - 'isl': int(isl) if isl else None, - 'osl': int(osl) if osl else None, - 'tput_per_gpu': float(bmk_result['total_token_throughput']) / tp_size, - 'output_tput_per_gpu': float(bmk_result['output_throughput']) / decode_gpus, - 'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput']) )/ prefill_gpus + 'spec_decoding': spec_decoding, + 'disagg': disagg, + 'isl': int(isl), + 'osl': int(osl), } -# Check if both PREFILL_GPUS and DECODE_GPUS env vars exist and are not empty -if prefill_gpus_str and decode_gpus_str: - data['disagg'] = True - data['num_prefill_gpu'] = prefill_gpus - data['num_decode_gpu'] = decode_gpus +is_multinode = os.environ.get('IS_MULTINODE', 'false').lower() == 'true' + +if is_multinode: + # TODO: Eventually will have to have a separate condition in here for 
multinode disagg and + # multinode agg. For now, just assume that multinode implies disagg. + + multinode_env = get_required_env_vars(['PREFILL_GPUS', 'DECODE_GPUS', 'PREFILL_NUM_WORKERS', 'PREFILL_NUM_TP', + 'PREFILL_EP', 'PREFILL_DP_ATTN', 'DECODE_NUM_WORKERS', 'DECODE_TP', 'DECODE_EP', 'DECODE_DP_ATTN']) + prefill_gpus = int(multinode_env['PREFILL_GPUS']) + decode_gpus = int(multinode_env['DECODE_GPUS']) + prefill_num_workers = int(multinode_env['PREFILL_NUM_WORKERS']) + prefill_tp = int(multinode_env['PREFILL_NUM_TP']) + prefill_ep = int(multinode_env['PREFILL_EP']) + prefill_dp_attn = multinode_env['PREFILL_DP_ATTN'] + decode_num_workers = int(multinode_env['DECODE_NUM_WORKERS']) + decode_tp = int(multinode_env['DECODE_TP']) + decode_ep = int(multinode_env['DECODE_EP']) + decode_dp_attn = multinode_env['DECODE_DP_ATTN'] + + multi_node_data = { + 'is_multinode': True, + 'prefill_tp': prefill_tp, + 'prefill_ep': prefill_ep, + 'prefill_dp_attention': prefill_dp_attn, + 'prefill_num_workers': prefill_num_workers, + 'decode_tp': decode_tp, + 'decode_ep': decode_ep, + 'decode_dp_attention': decode_dp_attn, + 'decode_num_workers': decode_num_workers, + 'num_prefill_gpu': prefill_gpus, + 'num_decode_gpu': decode_gpus, + 'tput_per_gpu': float(bmk_result['total_token_throughput']) / (prefill_gpus + decode_gpus), + 'output_tput_per_gpu': float(bmk_result['output_throughput']) / decode_gpus, + 'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput'])) / prefill_gpus, + } + + data = data | multi_node_data else: - data['disagg'] = False + if disagg: + raise ValueError("Disaggregated mode requires multinode setup.") + + single_node_env = get_required_env_vars(['TP', 'EP_SIZE', 'DP_ATTENTION']) + tp_size = int(single_node_env['TP']) + ep_size = int(single_node_env['EP_SIZE']) + dp_attention = single_node_env['DP_ATTENTION'] + + single_node_data = { + 'is_multinode': False, + 'tp': tp_size, + 'ep': ep_size, + 'dp_attention': 
dp_attention, + 'tput_per_gpu': float(bmk_result['total_token_throughput']) / tp_size, + 'output_tput_per_gpu': float(bmk_result['output_throughput']) / tp_size, + 'input_tput_per_gpu': (float(bmk_result['total_token_throughput']) - float(bmk_result['output_throughput'])) / tp_size, + } -if mtp_mode: # MTP - data['mtp'] = mtp_mode + data = data | single_node_data for key, value in bmk_result.items(): if key.endswith('ms'): data[key.replace('_ms', '')] = float(value) / 1000.0 if 'tpot' in key: - data[key.replace('_ms', '').replace('tpot', 'intvty')] = 1000.0 / float(value) + data[key.replace('_ms', '').replace( + 'tpot', 'intvty')] = 1000.0 / float(value) print(json.dumps(data, indent=2)) From a66597ddf11ad58854a00a0a1a461d5d4378b561 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 16:23:04 -0600 Subject: [PATCH 38/98] add is multinode --- .github/workflows/benchmark-multinode-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 99e42fd8a..aee825538 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -159,7 +159,7 @@ jobs: if [ -n "$gpus" ]; then echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus" - TP=$gpus RESULT_FILENAME=${result_file%.json} EP_SIZE=1 DP_ATTENTION=false PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py + IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py fi fi done From 0c22a964ddb04a07ee415a6288b1f3883ef5e7ec Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 20:10:34 -0600 Subject: [PATCH 39/98] bug fix --- .github/workflows/benchmark-multinode-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml 
b/.github/workflows/benchmark-multinode-tmpl.yml index aee825538..3ff8cc20a 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -159,7 +159,7 @@ jobs: if [ -n "$gpus" ]; then echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus" - IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py + RESULT_FILENAME="$result_file" IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py fi fi done From 6cec238c6495fa9a5bd04baae43fe7592c38dfe0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 20:29:25 -0600 Subject: [PATCH 40/98] bug fix --- .github/workflows/benchmark-multinode-tmpl.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 3ff8cc20a..d48fd94ba 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -159,7 +159,7 @@ jobs: if [ -n "$gpus" ]; then echo "Extracted: gpus=$gpus, prefill_gpus=$prefill_gpus, decode_gpus=$decode_gpus" - RESULT_FILENAME="$result_file" IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py + RESULT_FILENAME=${result_file%.json} IS_MULTINODE=true PREFILL_GPUS="$prefill_gpus" DECODE_GPUS="$decode_gpus" python3 utils/process_result.py fi fi done From 6ad9a8e84d90589fe45850e4ed661e9e16c595db Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 20:48:27 -0600 Subject: [PATCH 41/98] bug fix --- utils/process_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/process_result.py b/utils/process_result.py index fa22df56d..54209e842 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -58,7 +58,7 @@ def get_required_env_vars(required_vars): # TODO: Eventually will 
have to have a separate condition in here for multinode disagg and # multinode agg. For now, just assume that multinode implies disagg. - multinode_env = get_required_env_vars(['PREFILL_GPUS', 'DECODE_GPUS', 'PREFILL_NUM_WORKERS', 'PREFILL_NUM_TP', + multinode_env = get_required_env_vars(['PREFILL_GPUS', 'DECODE_GPUS', 'PREFILL_NUM_WORKERS', 'PREFILL_TP', 'PREFILL_EP', 'PREFILL_DP_ATTN', 'DECODE_NUM_WORKERS', 'DECODE_TP', 'DECODE_EP', 'DECODE_DP_ATTN']) prefill_gpus = int(multinode_env['PREFILL_GPUS']) decode_gpus = int(multinode_env['DECODE_GPUS']) From ea8ed1bd77c9d88310bdefafa04554ce56e1f123 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 21:18:15 -0600 Subject: [PATCH 42/98] bug fix --- utils/process_result.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utils/process_result.py b/utils/process_result.py index 54209e842..77f8c2a4b 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -63,7 +63,7 @@ def get_required_env_vars(required_vars): prefill_gpus = int(multinode_env['PREFILL_GPUS']) decode_gpus = int(multinode_env['DECODE_GPUS']) prefill_num_workers = int(multinode_env['PREFILL_NUM_WORKERS']) - prefill_tp = int(multinode_env['PREFILL_NUM_TP']) + prefill_tp = int(multinode_env['PREFILL_TP']) prefill_ep = int(multinode_env['PREFILL_EP']) prefill_dp_attn = multinode_env['PREFILL_DP_ATTN'] decode_num_workers = int(multinode_env['DECODE_NUM_WORKERS']) From 07f4af95238c7e7bfbcc93442093cf75a02b1a27 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 22:39:36 -0600 Subject: [PATCH 43/98] polishing --- .../workflows/full-sweep-1k1k-scheduler.yml | 163 ++++++++---- .../workflows/full-sweep-1k8k-scheduler.yml | 114 ++++++-- .../workflows/full-sweep-8k1k-scheduler.yml | 137 ++++++---- .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 17 +- runners/launch_gb200-nv-copy-2.sh | 131 ---------- runners/launch_gb200-nv-copy.sh | 245 ------------------ runners/launch_gb200-nv.sh | 185 ++----------- 
utils/matrix-logic/generate_sweep_configs.py | 3 - 8 files changed, 312 insertions(+), 683 deletions(-) delete mode 100755 runners/launch_gb200-nv-copy-2.sh delete mode 100755 runners/launch_gb200-nv-copy.sh diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index f5b552af5..220abdb5c 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -8,6 +8,7 @@ jobs: runs-on: ubuntu-latest outputs: multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -16,25 +17,30 @@ jobs: run: | pip install pydantic CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT - echo $CONFIG_JSON_MULTI_NODE - - # get-gptoss-configs: - # runs-on: ubuntu-latest - # outputs: - # multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} - # steps: - # - name: Checkout code - # uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 - - # - id: get-gptoss-configs - # run: | - # pip install pydantic - # CONFIG_JSON_MULTI_NODE=$(python3 
${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --node-type multinode --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss) - # echo "multi-node-search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT + + get-gptoss-configs: + runs-on: ubuntu-latest + outputs: + multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} + steps: + - name: Checkout code + uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 + + - id: get-gptoss-configs + run: | + pip install pydantic + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT benchmark-dsr1-multi-node: needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: dsr1 1k1k / strategy: @@ -68,49 +74,110 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) 
}} + benchmark-dsr1: + needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: dsr1 1k1k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }} + secrets: inherit + with: + exp-name: "dsr1_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss-single-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + + benchmark-gptoss-multi-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: dsr1 1k1k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ 
matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: "dsr1_1k1k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} - # benchmark-gptoss-multi-node: - # needs: get-gptoss-configs - # uses: ./.github/workflows/benchmark-multinode-tmpl.yml - # name: gptoss 1k1k / - # strategy: - # fail-fast: false - # matrix: - # config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }} - # secrets: inherit - # with: - # exp-name: "gptoss_1k1k" - # isl: 1024 - # osl: 1024 - # max-model-len: 2048 - # runner: ${{ matrix.config.runner }} - # image: ${{ matrix.config.image }} - # model: ${{ matrix.config.model }} - # framework: ${{ matrix.config.framework }} - # precision: ${{ matrix.config.precision }} - # tp: ${{ matrix.config.tp }} - # ep: ${{ matrix.config.ep }} - # dp-attn: ${{ matrix.config.dp-attn }} - # conc: ${{ matrix.config.conc }} + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} collect-dsr1-results: - needs: benchmark-dsr1-multi-node - if: ${{ always() }} + needs: [get-dsr1-configs, benchmark-dsr1, benchmark-dsr1-multi-node] + if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "dsr1_1k1k" - # 
collect-gptoss-results: - # needs: benchmark-gptoss-multi-node - # if: ${{ always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - # with: - # exp-name: "gptoss_1k1k" + collect-gptoss-results: + needs: [get-gptoss-configs, benchmark-gptoss-single-node, benchmark-gptoss-multi-node] + if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit + with: + exp-name: "gptoss_1k1k" calc-success-rate: - needs: [collect-dsr1-results] + needs: [collect-dsr1-results, collect-gptoss-results] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 098fff636..3bc808817 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -2,14 +2,13 @@ name: "Full Sweep Scheduler - 1k8k" on: workflow_dispatch: - schedule: - - cron: "0 0 * * *" jobs: get-dsr1-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -17,13 +16,16 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 
--config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -31,23 +33,62 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT + + benchmark-dsr1-multi-node: + needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: dsr1 1k8k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 8192 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: "dsr1_1k8k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} benchmark-dsr1: needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml name: dsr1 1k8k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }} secrets: inherit with: 
exp-name: "dsr1_1k8k" isl: 1024 osl: 8192 - max-model-len: 9216 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -58,20 +99,21 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - benchmark-gptoss: + benchmark-gptoss-single-node: needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml name: gptoss 1k8k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} secrets: inherit with: exp-name: "gptoss_1k8k" isl: 1024 osl: 8192 - max-model-len: 9216 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -82,24 +124,60 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + benchmark-gptoss-multi-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: dsr1 1k8k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 8192 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: "dsr1_1k8k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ 
matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + collect-dsr1-results: - needs: benchmark-dsr1 - if: ${{ always() }} + needs: [get-dsr1-configs, benchmark-dsr1, benchmark-dsr1-multi-node] + if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "dsr1_1k8k" collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} + needs: [get-gptoss-configs, benchmark-gptoss-single-node, benchmark-gptoss-multi-node] + if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "gptoss_1k8k" calc-success-rate: - needs: [benchmark-dsr1, benchmark-gptoss] + needs: [collect-dsr1-results, collect-gptoss-results] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 831b49b17..e807f1b65 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -2,14 +2,13 @@ name: "Full Sweep Scheduler - 8k1k" on: workflow_dispatch: - schedule: - - cron: "0 0 * * *" jobs: get-dsr1-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-dsr1-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-dsr1-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 
# v5.0.0 @@ -17,13 +16,16 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix dsr1) - echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT get-gptoss-configs: runs-on: ubuntu-latest outputs: - search-space-config: ${{ steps.get-gptoss-configs.outputs.search-space-config }} + multi-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.multi-node-search-space-config }} + single-node-search-space-config: ${{ steps.get-gptoss-configs.outputs.single-node-search-space-config }} steps: - name: Checkout code uses: actions/checkout@08c6903cd8c0fde910a37f88322edcfb5dd907a8 # v5.0.0 @@ -31,23 +33,62 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss) - echo "search-space-config=$CONFIG_JSON" >> 
$GITHUB_OUTPUT + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT + echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT + + benchmark-dsr1-multi-node: + needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: dsr1 8k1k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-dsr1-configs.outputs.multi-node-search-space-config) }} + secrets: inherit + with: + isl: 8192 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: "dsr1_8k1k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ 
matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} benchmark-dsr1: needs: get-dsr1-configs + if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml name: dsr1 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-dsr1-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }} secrets: inherit with: exp-name: "dsr1_8k1k" isl: 8192 osl: 1024 - max-model-len: 9216 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -58,20 +99,21 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - benchmark-gptoss: + benchmark-gptoss-single-node: needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml name: gptoss 8k1k / strategy: fail-fast: false matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.search-space-config) }} + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} secrets: inherit with: exp-name: "gptoss_8k1k" isl: 8192 osl: 1024 - max-model-len: 9216 + max-model-len: 2048 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -82,69 +124,60 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - # This is a workaround until we can integrate GB200 into master configs. 
- benchmark-gb200: + benchmark-gptoss-multi-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 8k1k sweep / + name: dsr1 8k1k / strategy: fail-fast: false matrix: - config: - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } + config: ${{ fromJson(needs.get-gptoss-configs.outputs.multi-node-search-space-config) }} secrets: inherit with: - runner: gb200 + isl: 8192 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k - isl: 8192 - osl: 1024 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} + exp-name: "dsr1_8k1k" + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ 
matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} collect-dsr1-results: - needs: [benchmark-dsr1, benchmark-gb200] - if: ${{ always() }} + needs: [get-dsr1-configs, benchmark-dsr1, benchmark-dsr1-multi-node] + if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "dsr1_8k1k" collect-gptoss-results: - needs: benchmark-gptoss - if: ${{ always() }} + needs: [get-gptoss-configs, benchmark-gptoss-single-node, benchmark-gptoss-multi-node] + if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: exp-name: "gptoss_8k1k" calc-success-rate: - needs: [benchmark-dsr1, benchmark-gptoss, benchmark-gb200] + needs: [collect-dsr1-results, collect-gptoss-results] if: ${{ always() }} runs-on: ubuntu-latest diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 801355bdd..9f611ef32 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -30,19 +30,4 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ - $ISL $OSL "${CONC_LIST// /x}" inf - -# if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then -# bash ./submit_disagg.sh $PREFILL_NODES \ -# $PREFILL_NUM_WORKERS \ -# $DECODE_NODES \ -# $DECODE_NUM_WORKERS \ -# $DECODE_NUM_WORKERS \ -# $ISL $OSL "${CONC_LIST// /x}" inf -# elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then -# concurrency_list="128x256x384x448x512x576x1024x2048x4096" -# bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL "${CONC_LIST// /x}" inf -# else -# echo "Unsupported ISL/OSL combination: $ISL/$OSL" -# exit 1 -# fi \ No newline at end of file + $ISL $OSL 
"${CONC_LIST// /x}" inf \ No newline at end of file diff --git a/runners/launch_gb200-nv-copy-2.sh b/runners/launch_gb200-nv-copy-2.sh deleted file mode 100755 index e4d3827c3..000000000 --- a/runners/launch_gb200-nv-copy-2.sh +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/bash - -# This script sets up the environment and launches multi-node benchmarks - -set -x - -# Set up environment variables for SLURM -export SLURM_PARTITION="batch" -export SLURM_ACCOUNT="benchmark" -export SLURM_JOB_NAME="benchmark-dynamo.job" -# For GB200 we have 4 GPUs per node -export NTASKS_PER_NODE=4 - -### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars -if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" - export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" -else - SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - - # Update the IMAGE variable to the squash file - export IMAGE=$SQUASH_FILE - - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" - export SERVED_MODEL_NAME="deepseek-r1-fp4" -fi - - -export ISL="$ISL" -export OSL="$OSL" - -bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}_slurm.sh" - -# Wait for all jobs to complete -echo "Waiting for all jobs to complete..." -while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - echo "Jobs still running..." - squeue --steps -u $USER - sleep 30 -done - -# FIXME: The below is bad and is a result of the indirection of the ways in which -# Dynamo jobs are launched. In a follow-up PR, the location of the result file should not -# depend on the runner, it should always be in the same spot in the GH workspace. 
- -# Process results from all configurations -if [[ $FRAMEWORK == "dynamo-trt" ]]; then - - # Find the logs directory (should be only one for this ISL/OSL combination) - LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - - # Find all result subdirectories in this logs directory - RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "No result subdirectories found in $LOGS_DIR" - exit 1 - fi - - echo "Found result subdirectories:" - echo "$RESULT_SUBDIRS" - - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Process individual concurrency result files - RESULTS_SUBDIR="$result_subdir/results" - - if [ -d "$RESULTS_SUBDIR" ]; then - echo "Processing results from: $RESULTS_SUBDIR" - - # Find all concurrency result files with new format - CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") - - for result_file in $CONCURRENCY_FILES; do - if [ -f "$result_file" ]; then - # Extract concurrency and GPU count from filename - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') - gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') - echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" - - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" - - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi - done - else - 
echo "Results subdirectory not found: $RESULTS_SUBDIR" - fi - done -else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement - # Find the latest log directory - # we do "tail -1" here since only the latest job will yield the result - LOGS_DIR=$(find . -path "*/logs/*/vllm_isl_${ISL}_osl_${OSL}" -type d 2>/dev/null | sort -V | tail -1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - ls -la $LOGS_DIR - - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" - echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE - fi - done -fi - -echo "All result files processed" \ No newline at end of file diff --git a/runners/launch_gb200-nv-copy.sh b/runners/launch_gb200-nv-copy.sh deleted file mode 100755 index 8c11cb0bf..000000000 --- a/runners/launch_gb200-nv-copy.sh +++ /dev/null @@ -1,245 +0,0 @@ -#!/usr/bin/bash - -# This script sets up the environment and launches multi-node benchmarks - -set -x - -# Set up environment variables for SLURM -export SLURM_PARTITION="batch" -export SLURM_ACCOUNT="benchmark" -export SLURM_JOB_NAME="benchmark-dynamo.job" - -### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars -if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" - export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" -else - SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 
's/[\/:@#]/_/g').sqsh" - srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - - # Update the IMAGE variable to the squash file - export IMAGE=$SQUASH_FILE - - export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2" - export SERVED_MODEL_NAME="deepseek-r1-fp4" -fi - - -export ISL="$ISL" -export OSL="$OSL" - -### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs -if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then - - # Set up Dynamo repository path - DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" - PERFORMANCE_SWEEPS_PATH="$DYNAMO_PATH/components/backends/trtllm/performance_sweeps" - - # Overview: - # The Dynamo repository contains the bench_serving repository as a submodule. - # The submit_disagg.sh script, located at $PERFORMANCE_SWEEPS_PATH, orchestrates the entire benchmarking workflow: - # 1. Launches the Dynamo inference service with the specified configuration. - # 2. Waits for the service to become healthy. - # 3. Initiates benchmarking using the bench_serving tools. - # 4. Monitors all jobs until completion. - # 5. Collects and processes the results. - - # Always clone and setup Dynamo - echo "Cloning Dynamo repository..." - rm -rf "$DYNAMO_PATH" - git clone https://github.com/cquil11/dynamo.git "$DYNAMO_PATH" - cd "$DYNAMO_PATH" - git checkout release/0.5.1-rc0.20251105-cam - git submodule update --init --recursive - - # Navigate to performance sweeps directory - cd "$PERFORMANCE_SWEEPS_PATH" - - # 1. CACHE_TRANSCEIVER_MAX_NUM_TOKENS controls the max_tokens_in_buffer value - # in cache_transceiver_config of TensorRT-LLM context and generation workers. - # Specifically, it is the max number of tokens the transfer buffer can fit. 
- - # Set up environment variables based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 - elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 - else - echo "Unsupported ISL/OSL combination: $ISL/$OSL" - exit 1 - fi - - # New stuff - # CONC_LIST - # ISL - # OSL - # IMAGE - - # PREFILL_NUM_WORKERS - # PREFILL_TP - # PREFILL_EP - # PREFILL_DP_ATTN - - # DECODE_NUM_WORKERS - # DECODE_TP - # DECODE_EP - # DECODE_DP_ATTN - - # Additional env vars needed - # PREFILL_MAX_NUM_TOKENS - # PREFILL_MAX_BATCH_SIZE - # DECODE_MAX_NUM_TOKENS - # DECODE_MAX_BATCH_SIZE - # DECODE_GPU_MEM_FRACTION - # DECODE_MTP_SIZE - # DECODE_EPLB_NUM_SLOTS - - echo "CONC_LIST=$CONC_LIST" - echo "ISL=$ISL" - echo "OSL=$OSL" - echo "IMAGE=$IMAGE" - - echo "PREFILL_NUM_WORKERS=$PREFILL_NUM_WORKERS" - echo "PREFILL_TP=$PREFILL_TP" - echo "PREFILL_EP=$PREFILL_EP" - echo "PREFILL_DP_ATTN=$PREFILL_DP_ATTN" - - echo "DECODE_NUM_WORKERS=$DECODE_NUM_WORKERS" - echo "DECODE_TP=$DECODE_TP" - echo "DECODE_EP=$DECODE_EP" - echo "DECODE_DP_ATTN=$DECODE_DP_ATTN" - - echo "PREFILL_MAX_NUM_TOKENS=$PREFILL_MAX_NUM_TOKENS" - echo "PREFILL_MAX_BATCH_SIZE=$PREFILL_MAX_BATCH_SIZE" - echo "DECODE_MAX_NUM_TOKENS=$DECODE_MAX_NUM_TOKENS" - echo "DECODE_MAX_BATCH_SIZE=$DECODE_MAX_BATCH_SIZE" - echo "DECODE_GPU_MEM_FRACTION=$DECODE_GPU_MEM_FRACTION" - echo "DECODE_MTP_SIZE=$DECODE_MTP_SIZE" - echo "DECODE_EPLB_NUM_SLOTS=$DECODE_EPLB_NUM_SLOTS" - - # For GB200, we use 4 tasks per node. 
- ntasks_per_node=4 - additional_slurm_args="--time=04:00:00" - - kind=dynamo_disagg - - gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) - total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) - total_tasks=$((total_nodes * ntasks_per_node)) - - # 4608 prefill max num toks originally - if [ $ISL == $OSL ]; then - sbatch --nodes=${total_nodes} \ - --ntasks=${total_tasks} \ - --ntasks-per-node=${ntasks_per_node} \ - --segment=${total_nodes} ${additional_slurm_args} \ - benchmark_disagg.slurm \ - ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ - ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ - ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ - ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ - ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ - ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ - ${DECODE_MTP_SIZE} ${CONC_LIST} \ - ${gen_nodes} ${kind} \ - ${MODEL_PATH} ${SERVED_MODEL_NAME} \ - ${IMAGE} ${ISL} ${OSL} - # else - # sbatch --nodes=${total_nodes} --ntasks=${total_tasks} --ntasks-per-node=${ntasks_per_node} --segment=${total_nodes} ${slurm_args} benchmark_disagg.slurm ${ctx_num} 4 1 8448 true ${gen_num} ${gen_tp_size} ${gen_batch_size} ${gen_max_num_tokens} ${gen_enable_attention_dp} ${gen_gpu_memory_fraction} ${gen_eplb_num_slots} ${gen_mtp_size} "${gen_concurrency_list}" ${gen_nodes} ${kind} ${MODEL_PATH} ${SERVED_MODEL_NAME} ${IMAGE} ${ISL} ${OSL} - fi -fi - -# Wait for all jobs to complete -# echo "Waiting for all jobs to complete..." -while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - echo "Jobs still running..." - squeue --steps -u $USER - sleep 60 -done - -### FRAMEWORK_DIFF_IF_STATEMENT #3 - difference in log post-processing -if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then - - # Find the logs directory (should be only one for this ISL/OSL combination) - LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - - # Find all result subdirectories in this logs directory - RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_[td]ep*_batch*_eplb*_mtp*" -type d) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "No result subdirectories found in $LOGS_DIR" - exit 1 - fi - - echo "Found result subdirectories:" - echo "$RESULT_SUBDIRS" - - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" - - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") - - # Process individual concurrency result files - RESULTS_SUBDIR="$result_subdir/results" - - if [ -d "$RESULTS_SUBDIR" ]; then - echo "Processing results from: $RESULTS_SUBDIR" - - # Find all concurrency result files with new format - CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") - - for result_file in $CONCURRENCY_FILES; do - if [ -f "$result_file" ]; then - # Extract concurrency and GPU count from filename - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') - gpus=$(echo "$filename" | sed 's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') - echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" - - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" - - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi - done - else - echo "Results subdirectory not found: $RESULTS_SUBDIR" - fi - done - -else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement - # Find the latest log directory - # we do 
"tail -1" here since only the latest job will yield the result - LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - ls $LOGS_DIR - - # Result JSON are contained within the result directory - for result_file in $(find $LOGS_DIR -type f); do - # result_file should directly be isl_ISL_osl_OSL_concurrency_CONC_req_rate_R_gpus_N_ctx_M_gen_N.json - file_name=$(basename $result_file) - if [ -f $result_file ]; then - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${file_name}" - echo "Found result file ${result_file}. Copying them to ${WORKSPACE_RESULT_FILE}" - cp $result_file $WORKSPACE_RESULT_FILE - fi - done -fi - -echo "All result files processed" \ No newline at end of file diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index bf41e1e12..e4d3827c3 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -2,11 +2,14 @@ # This script sets up the environment and launches multi-node benchmarks +set -x # Set up environment variables for SLURM export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" export SLURM_JOB_NAME="benchmark-dynamo.job" +# For GB200 we have 4 GPUs per node +export NTASKS_PER_NODE=4 ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars if [[ $FRAMEWORK == "dynamo-sglang" ]]; then @@ -28,179 +31,22 @@ fi export ISL="$ISL" export OSL="$OSL" -### FRAMEWORK_DIFF_IF_STATEMENT #2 - difference in launching jobs -if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then - - # Set up Dynamo repository path - DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" - PERFORMANCE_SWEEPS_PATH="$DYNAMO_PATH/components/backends/trtllm/performance_sweeps" - - # Overview: - # The Dynamo repository contains the bench_serving repository as a submodule. 
- # The submit_disagg.sh script, located at $PERFORMANCE_SWEEPS_PATH, orchestrates the entire benchmarking workflow: - # 1. Launches the Dynamo inference service with the specified configuration. - # 2. Waits for the service to become healthy. - # 3. Initiates benchmarking using the bench_serving tools. - # 4. Monitors all jobs until completion. - # 5. Collects and processes the results. - - # Always clone and setup Dynamo - echo "Cloning Dynamo repository..." - rm -rf "$DYNAMO_PATH" - git clone https://github.com/ai-dynamo/dynamo.git "$DYNAMO_PATH" - cd "$DYNAMO_PATH" - git checkout release/0.5.1-rc0.20251105 - git submodule update --init --recursive - - # Navigate to performance sweeps directory - cd "$PERFORMANCE_SWEEPS_PATH" - - # 1. CACHE_TRANSCEIVER_MAX_NUM_TOKENS controls the max_tokens_in_buffer value - # in cache_transceiver_config of TensorRT-LLM context and generation workers. - # Specifically, it is the max number of tokens the transfer buffer can fit. - - # Set up environment variables based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 - elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 - else - echo "Unsupported ISL/OSL combination: $ISL/$OSL" - exit 1 - fi - - # Generate benchmark configurations based on ISL/OSL and MTP mode - generate_benchmark_configs() { - local isl="$1" - local osl="$2" - local mtp_mode="$3" - - # Usage: - # ./submit_disagg.sh [ctx_num] [gen_num] [gen_tp_size] [gen_batch_size] [gen_max_num_tokens] [gen_gpu_memory_fraction] [gen_eplb_num_slots] [gen_mtp_size] [gen_concurrency_list]" - # MTP Modes: - # mtp=off - Run without Multi-Token Prediction (gen_mtp_size=0) - # mtp=on - Run with Multi-Token Prediction (gen_mtp_size=1,2,3) - # Execution Modes: - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # dep - Run Data-Expert Parallel mode (attention_dp=true) - # Parameters for tep/dep modes: - # ctx_num: 
Number of context nodes - # gen_num: Number of generation nodes - # gen_tp_size: Generation tensor parallel size - # gen_batch_size: Generation batch size - # gen_max_num_tokens: Generation max number of tokens - # gen_gpu_memory_fraction: GPU memory fraction (0.7-0.95) - # gen_mtp_size: Multi-Token Prediction size (0 for mtp=off, 1-3 for mtp=on) - # gen_eplb_num_slots: Expert load balancing slots (0, 256, 288) - # gen_concurrency_list: Concurrency values (space-separated, quoted) - - if [ "$isl" = "1024" ] && [ "$osl" = "1024" ]; then - if [ "$mtp_mode" = "on" ]; then - echo "Running 1k/1k MTP=ON configurations" - - ./submit_disagg.sh "mtp=on" "tep" 1 4 8 32 128 "0.9" 3 0 "1 2 4 8 16 36" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 16 64 256 "0.7" 3 0 "512 1075" - - ./submit_disagg.sh "mtp=on" "dep" 2 1 16 128 256 "0.7" 1 0 "2150" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 32 16 64 "0.6" 3 0 "512" - - ./submit_disagg.sh "mtp=on" "dep" 1 1 8 256 512 "0.8" 1 0 "2252" - else - echo "Running 1k/1k MTP=OFF configurations" - - ./submit_disagg.sh "mtp=off" "tep" 1 4 8 128 128 "0.9" 0 0 "1 2 4 8 16 32 64 141" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 32 32 32 "0.7" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 16 64 64 "0.75" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 2 1 16 256 256 "0.75" 0 0 "2048 4300" - - ./submit_disagg.sh "mtp=off" "dep" 1 1 8 512 512 "0.8" 0 0 "4300" - fi - elif [ "$isl" = "8192" ] && [ "$osl" = "1024" ]; then - if [ "$mtp_mode" = "on" ]; then - echo "Running 8k/1k MTP=ON configurations" - - ./submit_disagg.sh "mtp=on" "tep" 1 3 8 16 64 "0.9" 3 0 "1 2 4 8 18" - - ./submit_disagg.sh "mtp=on" "dep" 5 1 32 8 32 "0.7" 3 0 "128 269" - - ./submit_disagg.sh "mtp=on" "dep" 8 1 32 16 64 "0.7" 3 0 "538" - - ./submit_disagg.sh "mtp=on" "dep" 8 1 16 64 256 "0.75" 2 0 "1075" - - ./submit_disagg.sh "mtp=on" "dep" 6 1 8 256 512 "0.8" 1 0 "2150" - else - echo "Running 8k/1k MTP=OFF configurations" - - ./submit_disagg.sh "mtp=off" "tep" 1 3 8 32 32 
"0.9" 0 0 "1 2 4 8 16 34" - - ./submit_disagg.sh "mtp=off" "dep" 4 1 32 16 16 "0.7" 0 0 "256 538" - - ./submit_disagg.sh "mtp=off" "dep" 6 1 16 64 64 "0.75" 0 0 "1075" - - ./submit_disagg.sh "mtp=off" "dep" 8 1 16 128 128 "0.75" 0 0 "2150" - - ./submit_disagg.sh "mtp=off" "dep" 5 1 8 256 256 "0.8" 0 0 "2150" - fi - else - echo "Unsupported ISL/OSL combination: $isl/$osl" - exit 1 - fi - } - - # Run all benchmark configurations - generate_benchmark_configs "$ISL" "$OSL" "$MTP_MODE" - -else # if statement at the top - search for "FRAMEWORK_DIFF_IF_STATEMENT #2" - # Set up Dynamo repository path - DYNAMO_PATH="/mnt/lustre01/users/sa-shared/benchmarks/dynamo" - SGL_SLURM_JOBS_PATH="$DYNAMO_PATH/components/backends/sglang/slurm_jobs" - - # Always clone and setup Dynamo - echo "Cloning Dynamo repository..." - rm -rf "$DYNAMO_PATH" - git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git $DYNAMO_PATH - cd "$DYNAMO_PATH" - - # Navigate to corresponding directory - cd "$SGL_SLURM_JOBS_PATH" - - # Set up SGL launch script-specific environment variables - export SLURM_ACCOUNT=$SLURM_ACCOUNT - export SLURM_PARTITION=$SLURM_PARTITION - export TIME_LIMIT="04:00:00" - export MODEL_PATH=$MODEL_PATH - export CONFIG_DIR=$CONFIG_DIR - export CONTAINER_IMAGE=$IMAGE - - # Launch jobs based on ISL/OSL - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - concurrency_list="1024x2048x4096x4608x4864x4992x5120x5376x5632x6144x8192" - bash ./submit_disagg.sh 6 3 12 1 8 $ISL $OSL $concurrency_list inf - elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then - concurrency_list="128x256x384x448x512x576x1024x2048x4096" - bash ./submit_disagg.sh 12 6 6 1 8 $ISL $OSL $concurrency_list inf - else - echo "Unsupported ISL/OSL combination: $ISL/$OSL" - exit 1 - fi -fi +bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}_slurm.sh" # Wait for all jobs to complete echo "Waiting for all jobs to complete..." 
while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do echo "Jobs still running..." - squeue -u $USER - sleep 60 + squeue --steps -u $USER + sleep 30 done -echo "All jobs completed" -### FRAMEWORK_DIFF_IF_STATEMENT #3 - difference in log post-processing -if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then +# FIXME: The below is bad and is a result of the indirection of the ways in which +# Dynamo jobs are launched. In a follow-up PR, the location of the result file should not +# depend on the runner, it should always be in the same spot in the GH workspace. + +# Process results from all configurations +if [[ $FRAMEWORK == "dynamo-trt" ]]; then # Find the logs directory (should be only one for this ISL/OSL combination) LOGS_DIR=$(find . -name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) @@ -257,18 +103,17 @@ if [[ $FRAMEWORK == "dynamo-trtllm" ]]; then echo "Results subdirectory not found: $RESULTS_SUBDIR" fi done - else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory # we do "tail -1" here since only the latest job will yield the result - LOGS_DIR=$(find logs/*/vllm_isl_${ISL}_osl_${OSL} -type d | sort -V | tail -1) + LOGS_DIR=$(find . 
-path "*/logs/*/vllm_isl_${ISL}_osl_${OSL}" -type d 2>/dev/null | sort -V | tail -1) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 fi echo "Found logs directory: $LOGS_DIR" - ls $LOGS_DIR + ls -la $LOGS_DIR # Result JSON are contained within the result directory for result_file in $(find $LOGS_DIR -type f); do @@ -283,4 +128,4 @@ else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement done fi -echo "All result files processed" +echo "All result files processed" \ No newline at end of file diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 58bfa6c99..3943c317d 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -205,9 +205,6 @@ def generate_full_sweep(args, all_config_data, runner_data): if conc > conc_end: conc = conc_end - if len(matrix_values) == 0: - raise ValueError("No configs found matching input filters.") - return matrix_values From 01be2d594a927ac912b9d138febbfb85a295aba3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 22:45:39 -0600 Subject: [PATCH 44/98] polishing pt 2 --- .../workflows/full-sweep-1k1k-scheduler.yml | 72 ++++++++------- .../workflows/full-sweep-1k8k-scheduler.yml | 72 ++++++++------- .../workflows/full-sweep-8k1k-scheduler.yml | 92 ++++++++++--------- 3 files changed, 133 insertions(+), 103 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 220abdb5c..c5c7a456d 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -42,7 +42,7 @@ jobs: needs: get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k1k / + name: dsr1 1k1k multi-node / strategy: fail-fast: false matrix: @@ -74,11 +74,11 @@ 
jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - benchmark-dsr1: + benchmark-dsr1-single-node: needs: get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k / + name: dsr1 1k1k single-node / strategy: fail-fast: false matrix: @@ -99,36 +99,11 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - benchmark-gptoss-single-node: - needs: get-gptoss-configs - if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} - secrets: inherit - with: - exp-name: "gptoss_1k1k" - isl: 1024 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-gptoss-multi-node: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k1k / + name: dsr1 1k1k multi-node / strategy: fail-fast: false matrix: @@ -160,8 +135,38 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + benchmark-gptoss-single-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k / + strategy: + fail-fast: false + 
matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k1k" + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + collect-dsr1-results: - needs: [get-dsr1-configs, benchmark-dsr1, benchmark-dsr1-multi-node] + needs: + [ + get-dsr1-configs, + benchmark-dsr1-single-node, + benchmark-dsr1-multi-node, + ] if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit @@ -169,7 +174,12 @@ jobs: exp-name: "dsr1_1k1k" collect-gptoss-results: - needs: [get-gptoss-configs, benchmark-gptoss-single-node, benchmark-gptoss-multi-node] + needs: + [ + get-gptoss-configs, + benchmark-gptoss-single-node, + benchmark-gptoss-multi-node, + ] if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 3bc808817..f9200585c 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -42,7 +42,7 @@ jobs: needs: get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k8k / + name: dsr1 1k8k multi-node / strategy: fail-fast: false matrix: @@ -74,11 +74,11 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - benchmark-dsr1: + benchmark-dsr1-single-node: needs: 
get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k8k / + name: dsr1 1k8k single-node / strategy: fail-fast: false matrix: @@ -99,36 +99,11 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - benchmark-gptoss-single-node: - needs: get-gptoss-configs - if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} - secrets: inherit - with: - exp-name: "gptoss_1k8k" - isl: 1024 - osl: 8192 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - benchmark-gptoss-multi-node: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k8k / + name: dsr1 1k8k multi-node / strategy: fail-fast: false matrix: @@ -160,8 +135,38 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + benchmark-gptoss-single-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k8k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k8k" + isl: 1024 + osl: 8192 + max-model-len: 2048 + 
runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + collect-dsr1-results: - needs: [get-dsr1-configs, benchmark-dsr1, benchmark-dsr1-multi-node] + needs: + [ + get-dsr1-configs, + benchmark-dsr1-single-node, + benchmark-dsr1-multi-node, + ] if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit @@ -169,7 +174,12 @@ jobs: exp-name: "dsr1_1k8k" collect-gptoss-results: - needs: [get-gptoss-configs, benchmark-gptoss-single-node, benchmark-gptoss-multi-node] + needs: + [ + get-gptoss-configs, + benchmark-gptoss-single-node, + benchmark-gptoss-multi-node, + ] if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index e807f1b65..4e1941e50 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -1,4 +1,4 @@ -name: "Full Sweep Scheduler - 8k1k" +name: "Full Sweep Scheduler - 1k1k" on: workflow_dispatch: @@ -16,8 +16,8 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k 
--model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -42,7 +42,7 @@ jobs: needs: get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 8k1k / + name: dsr1 1k1k multi-node / strategy: fail-fast: false matrix: @@ -57,7 +57,7 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: "dsr1_8k1k" + exp-name: "dsr1_1k1k" conc-list: ${{ toJson(matrix.config.conc) }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} @@ -74,43 +74,18 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} - benchmark-dsr1: + benchmark-dsr1-single-node: needs: get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 8k1k / + name: dsr1 1k1k single-node / strategy: fail-fast: false matrix: config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }} secrets: inherit with: - exp-name: "dsr1_8k1k" - isl: 8192 - osl: 1024 - max-model-len: 2048 - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - benchmark-gptoss-single-node: - needs: get-gptoss-configs - if: ${{ 
needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} - secrets: inherit - with: - exp-name: "gptoss_8k1k" + exp-name: "dsr1_1k1k" isl: 8192 osl: 1024 max-model-len: 2048 @@ -128,7 +103,7 @@ jobs: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 8k1k / + name: dsr1 1k1k multi-node / strategy: fail-fast: false matrix: @@ -143,7 +118,7 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: "dsr1_8k1k" + exp-name: "dsr1_1k1k" conc-list: ${{ toJson(matrix.config.conc) }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} @@ -160,21 +135,56 @@ jobs: decode-dp-attn: ${{ matrix.config.decode.dp-attn }} decode-additional-settings: ${{ toJson(matrix.config.decode.additional-settings) }} + benchmark-gptoss-single-node: + needs: get-gptoss-configs + if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} + uses: ./.github/workflows/benchmark-tmpl.yml + name: gptoss 1k1k / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} + secrets: inherit + with: + exp-name: "gptoss_1k1k" + isl: 8192 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + tp: ${{ matrix.config.tp }} + ep: ${{ matrix.config.ep }} + dp-attn: ${{ matrix.config.dp-attn }} + conc: ${{ matrix.config.conc }} + collect-dsr1-results: - needs: [get-dsr1-configs, 
benchmark-dsr1, benchmark-dsr1-multi-node] + needs: + [ + get-dsr1-configs, + benchmark-dsr1-single-node, + benchmark-dsr1-multi-node, + ] if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: - exp-name: "dsr1_8k1k" + exp-name: "dsr1_1k1k" collect-gptoss-results: - needs: [get-gptoss-configs, benchmark-gptoss-single-node, benchmark-gptoss-multi-node] + needs: + [ + get-gptoss-configs, + benchmark-gptoss-single-node, + benchmark-gptoss-multi-node, + ] if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: - exp-name: "gptoss_8k1k" + exp-name: "gptoss_1k1k" calc-success-rate: needs: [collect-dsr1-results, collect-gptoss-results] From 825712b0cd2dd5da38ba3929c1df0d5d16dea9b0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 22:46:59 -0600 Subject: [PATCH 45/98] polishing pt 3 --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- .github/workflows/full-sweep-1k8k-scheduler.yml | 2 +- .github/workflows/full-sweep-8k1k-scheduler.yml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index c5c7a456d..a4348f114 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -139,7 +139,7 @@ jobs: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / + name: gptoss 1k1k single-node / strategy: fail-fast: false matrix: diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index f9200585c..c099ac8c6 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -139,7 +139,7 @@ jobs: needs: 
get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k8k / + name: gptoss 1k8k single-node / strategy: fail-fast: false matrix: diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 4e1941e50..092f94639 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -139,7 +139,7 @@ jobs: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / + name: gptoss 1k1k single-node / strategy: fail-fast: false matrix: From e9e793388380f6fe4c9e2d58f4d5d34fb819f542 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 26 Nov 2025 23:08:58 -0600 Subject: [PATCH 46/98] polishing pt 4 --- .../workflows/benchmark-multinode-tmpl.yml | 3 +- .../workflows/full-sweep-1k1k-scheduler.yml | 2 +- .../workflows/full-sweep-1k8k-scheduler.yml | 2 +- .../workflows/full-sweep-8k1k-scheduler.yml | 30 +++++++++---------- 4 files changed, 18 insertions(+), 19 deletions(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index 9c5d98ac4..d5561739f 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -131,10 +131,9 @@ jobs: RUNNER_NAME: ${{ runner.name }} RESULT_FILENAME: ${{ env.EXP_NAME }}_${{ env.PRECISION }}_ptp${{ env.PREFILL_TP }}pep${{ env.PREFILL_EP}}_dtp${{ env.DECODE_TP}}dep${{ env.DECODE_EP }}_${{ env.FRAMEWORK }}_specdec-${{ env.SPEC_DECODING }}_conc${{ join(fromJson(inputs.conc-list), 'x') }}_${{ runner.name }} run: | - # bash ./runners/launch_${RUNNER_NAME%%_*}.sh set -x export ${{ join(fromJson(inputs.prefill-additional-settings), ' ') }} ${{ join(fromJson(inputs.decode-additional-settings), ' ') }} - bash 
./runners/launch_gb200-nv-copy-2.sh + bash ./runners/launch_${RUNNER_NAME%%_*}.sh # Check if at least one result file was created if ls ${RESULT_FILENAME}_*.json 1> /dev/null 2>&1; then echo "RESULT_FILENAME=${RESULT_FILENAME}" >> $GITHUB_ENV diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 1a5a60c22..11af50c48 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -103,7 +103,7 @@ jobs: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k1k multi-node / + name: gptoss 1k1k multi-node / strategy: fail-fast: false matrix: diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 45102f1de..29963c44a 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -103,7 +103,7 @@ jobs: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k8k multi-node / + name: gptoss 1k8k multi-node / strategy: fail-fast: false matrix: diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 909c6ae30..32b2e47dc 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -1,4 +1,4 @@ -name: "Full Sweep Scheduler - 1k1k" +name: "Full Sweep Scheduler - 8k1k" on: workflow_dispatch: @@ -16,8 +16,8 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml 
--runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files 
${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -42,7 +42,7 @@ jobs: needs: get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k1k multi-node / + name: dsr1 8k1k multi-node / strategy: fail-fast: false matrix: @@ -57,7 +57,7 @@ jobs: model: ${{ matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: "dsr1_1k1k" + exp-name: "dsr1_8k1k" conc-list: ${{ toJson(matrix.config.conc) }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} @@ -78,14 +78,14 @@ jobs: needs: get-dsr1-configs if: ${{ needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k single-node / + name: dsr1 8k1k single-node / strategy: fail-fast: false matrix: config: ${{ fromJson(needs.get-dsr1-configs.outputs.single-node-search-space-config) }} secrets: inherit with: - exp-name: "dsr1_1k1k" + exp-name: "dsr1_8k1k" isl: 8192 osl: 1024 max-model-len: 2048 @@ -103,7 +103,7 @@ jobs: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: dsr1 1k1k multi-node / + name: gptoss 8k1k multi-node / strategy: fail-fast: false matrix: @@ -118,7 +118,7 @@ jobs: model: ${{ 
matrix.config.model }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} - exp-name: "dsr1_1k1k" + exp-name: "dsr1_8k1k" conc-list: ${{ toJson(matrix.config.conc) }} spec-decoding: ${{ matrix.config.spec-decoding }} disagg: ${{ matrix.config.disagg }} @@ -139,14 +139,14 @@ jobs: needs: get-gptoss-configs if: ${{ needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' }} uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k single-node / + name: gptoss 8k1k single-node / strategy: fail-fast: false matrix: config: ${{ fromJson(needs.get-gptoss-configs.outputs.single-node-search-space-config) }} secrets: inherit with: - exp-name: "gptoss_1k1k" + exp-name: "gptoss_8k1k" isl: 8192 osl: 1024 max-model-len: 2048 @@ -171,7 +171,7 @@ jobs: uses: ./.github/workflows/collect-results.yml secrets: inherit with: - exp-name: "dsr1_1k1k" + exp-name: "dsr1_8k1k" collect-gptoss-results: needs: @@ -184,7 +184,7 @@ jobs: uses: ./.github/workflows/collect-results.yml secrets: inherit with: - exp-name: "gptoss_1k1k" + exp-name: "gptoss_8k1k" calc-success-rate: needs: [collect-dsr1-results, collect-gptoss-results] From 95a05aaf4de4633f1e2f965653afa3bf6d82f308 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 18:46:57 -0600 Subject: [PATCH 47/98] fixing summarize.py --- utils/summarize.py | 93 +++++++++++++++++++++++++++++++--------------- 1 file changed, 64 insertions(+), 29 deletions(-) diff --git a/utils/summarize.py b/utils/summarize.py index 503da2690..93035b741 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -9,36 +9,71 @@ with open(result_path) as f: result = json.load(f) results.append(result) -results.sort(key=lambda r: (r.get('model', 'unknown'), r['hw'], r.get('framework', 'vllm'), r.get('precision', 'fp8'), r.get('isl', 0), r.get('osl', 0), r['tp'], r['ep'], r['conc'])) -summary_header = f'''\ +is_multinode = results[0]['is_multinode'] + +if is_multinode: + results.sort(key=lambda r: 
(r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) + + summary_header = '''\ +| Model | Hardware | Framework | Precision | ISL | OSL | Prefill TP | Prefill EP | Prefill DP Attn | Prefill Workers | Prefill GPUs | Decode TP | Decode EP | Decode DP Attn | Decode Workers | Decode GPUs | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +''' + print(summary_header) + + for result in results: + print( + f"| {result['model']} " + f"| {result['hw'].upper()} " + f"| {result['framework'].upper()} " + f"| {result['precision'].upper()} " + f"| {result['isl']} " + f"| {result['osl']} " + f"| {result['prefill_tp']} " + f"| {result['prefill_ep']} " + f"| {result['prefill_dp_attention']} " + f"| {result['prefill_num_workers']} " + f"| {result['num_prefill_gpu']} " + f"| {result['decode_tp']} " + f"| {result['decode_ep']} " + f"| {result['decode_dp_attention']} " + f"| {result['decode_num_workers']} " + f"| {result['num_decode_gpu']} " + f"| {result['conc']} " + f"| {(result['median_ttft'] * 1000):.4f} " + f"| {(result['median_tpot'] * 1000):.4f} " + f"| {result['median_intvty']:.4f} " + f"| {result['median_e2el']:.4f} " + f"| {result['tput_per_gpu']:.4f} " + f"| {result['output_tput_per_gpu']:.4f} " + f"| {result['input_tput_per_gpu']:.4f} |" + ) +else: + results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) + + summary_header = '''\ | Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | 
:-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' -print(summary_header) - -for result in results: - framework = result.get('framework', 'vllm') - precision = result.get('precision', 'fp8') - model = result.get('model', 'unknown') - isl = result.get('isl', 'N/A') - osl = result.get('osl', 'N/A') - print( - f"| {model} " - f"| {result['hw'].upper()} " - f"| {framework.upper()} " - f"| {precision.upper()} " - f"| {isl} " - f"| {osl} " - f"| {result['tp']} " - f"| {result['ep']} " - f"| {result['dp_attention']} " - f"| {result['conc']} " - f"| {(result['median_ttft'] * 1000):.4f} " - f"| {(result['median_tpot'] * 1000):.4f} " - f"| {result['median_intvty']:.4f} " - f"| {result['median_e2el']:.4f} " - f"| {result['tput_per_gpu']:.4f} " - f"| {result['output_tput_per_gpu']:.4f} " - f"| {result['input_tput_per_gpu']:.4f} |" - ) + print(summary_header) + + for result in results: + print( + f"| {result['model']} " + f"| {result['hw'].upper()} " + f"| {result['framework'].upper()} " + f"| {result['precision'].upper()} " + f"| {result['isl']} " + f"| {result['osl']} " + f"| {result['tp']} " + f"| {result['ep']} " + f"| {result['dp_attention']} " + f"| {result['conc']} " + f"| {(result['median_ttft'] * 1000):.4f} " + f"| {(result['median_tpot'] * 1000):.4f} " + f"| {result['median_intvty']:.4f} " + f"| {result['median_e2el']:.4f} " + f"| {result['tput_per_gpu']:.4f} " + f"| {result['output_tput_per_gpu']:.4f} " + f"| {result['input_tput_per_gpu']:.4f} |" + ) From da5e44cd227a40d87bd776a5a04b6a67a0f22785 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 18:50:34 -0600 Subject: [PATCH 48/98] polishing --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 29efb8dbb..4a67cb347 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ 
b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -17,15 +17,12 @@ else DECODE_MTP_SIZE="0" fi -PERFORMANCE_SWEEPS_PATH="components/backends/trtllm/performance_sweeps" - echo "Cloning Dynamo repository..." -git clone https://github.com/cquil11/dynamo.git -cd dynamo -git checkout release/0.5.1-rc0.20251105-cam +git clone https://github.com/ai-dynamo/dynamo.git +git checkout release/0.5.1-rc0.20251105 git submodule update --init --recursive -cd "$PERFORMANCE_SWEEPS_PATH" +cd dynamo/components/backends/trtllm/performance_sweeps # Set up environment variables based on ISL/OSL if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then From a8656872b16b769c2c6a2aef462ae18bb4588b09 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 18:53:34 -0600 Subject: [PATCH 49/98] testing --- .github/configs/nvidia-master.yaml | 634 ++++++++++++++--------------- 1 file changed, 317 insertions(+), 317 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ea6d9026d..2dd8711df 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,330 +1,330 @@ -dsr1-fp4-b200-sglang: - image: lmsysorg/sglang:v0.5.5-cu129-amd64 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - runner: b200 - precision: fp4 - framework: sglang - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } +# dsr1-fp4-b200-sglang: +# image: lmsysorg/sglang:v0.5.5-cu129-amd64 +# model: nvidia/DeepSeek-R1-0528-FP4-V2 +# model-prefix: dsr1 +# runner: b200 +# precision: fp4 +# framework: sglang +# multinode: false +# 
seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } -dsr1-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: nvidia/DeepSeek-R1-0528-FP4-V2 - model-prefix: dsr1 - runner: b200-trt - precision: fp4 - framework: trt - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # If TP=4, - # If CONC > 32, then EP=4 - # If CONC >= 256, DP_ATTN=true - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - # If TP=8, - # If CONC > 8, then EP=8 - # If CONC >= 256, DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 8 } - - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - - isl: 1024 - osl: 8192 - search-space: - # If TP=4, - # If CONC > 32, then EP=4 - # If CONC >= 256, DP_ATTN=true - - { tp: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } - # If TP=8, - # If CONC > 16, then EP=8 - # If CONC >= 256, DP_ATTN=true - - { tp: 8, conc-start: 4, conc-end: 16 } - - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } - - isl: 8192 - osl: 1024 - search-space: - # If TP=4, - # If CONC > 32, then EP=4 and DP_ATTN=true - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } - - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } - # If TP=8, - # If CONC > 32, then EP=8 and DP_ATTN=true 
- - { tp: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } +# dsr1-fp4-b200-trt: +# image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 +# model: nvidia/DeepSeek-R1-0528-FP4-V2 +# model-prefix: dsr1 +# runner: b200-trt +# precision: fp4 +# framework: trt +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# # If TP=4, +# # If CONC > 32, then EP=4 +# # If CONC >= 256, DP_ATTN=true +# - { tp: 4, conc-start: 4, conc-end: 32 } +# - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } +# - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } +# # If TP=8, +# # If CONC > 8, then EP=8 +# # If CONC >= 256, DP_ATTN=true +# - { tp: 8, conc-start: 4, conc-end: 8 } +# - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } +# - isl: 1024 +# osl: 8192 +# search-space: +# # If TP=4, +# # If CONC > 32, then EP=4 +# # If CONC >= 256, DP_ATTN=true +# - { tp: 4, conc-start: 4, conc-end: 32 } +# - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } +# - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } +# # If TP=8, +# # If CONC > 16, then EP=8 +# # If CONC >= 256, DP_ATTN=true +# - { tp: 8, conc-start: 4, conc-end: 16 } +# - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } +# - isl: 8192 +# osl: 1024 +# search-space: +# # If TP=4, +# # If CONC > 32, then EP=4 and DP_ATTN=true +# - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } +# - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } +# # If TP=8, +# # If CONC > 32, then EP=8 and DP_ATTN=true +# - { tp: 8, conc-start: 4, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } -dsr1-fp8-b200-sglang: - image: lmsysorg/sglang:v0.5.5-cu129-amd64 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: b200 - precision: fp8 - framework: sglang - multinode: 
false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } +# dsr1-fp8-b200-sglang: +# image: lmsysorg/sglang:v0.5.5-cu129-amd64 +# model: deepseek-ai/DeepSeek-R1-0528 +# model-prefix: dsr1 +# runner: b200 +# precision: fp8 +# framework: sglang +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } -dsr1-fp8-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: b200-trt - precision: fp8 - framework: trt - multinode: false - seq-len-configs: - # For all sequence lengths, EP=TP - - isl: 1024 - osl: 1024 - search-space: - # If CONC > 32, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - # If CONC > 64, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - # If CONC > 64, then DP_ATTN=true - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +# dsr1-fp8-b200-trt: +# image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 +# model: deepseek-ai/DeepSeek-R1-0528 +# model-prefix: dsr1 +# runner: b200-trt +# precision: fp8 +# framework: trt +# multinode: false +# seq-len-configs: +# # For all sequence lengths, EP=TP +# - isl: 1024 +# osl: 1024 +# search-space: +# # If CONC > 32, then DP_ATTN=true +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, 
conc-end: 64 } +# - isl: 1024 +# osl: 8192 +# search-space: +# # If CONC > 64, then DP_ATTN=true +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +# - isl: 8192 +# osl: 1024 +# search-space: +# # If CONC > 64, then DP_ATTN=true +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-h200-sglang: - image: lmsysorg/sglang:v0.5.5-cu129-amd64 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: h200 - precision: fp8 - framework: sglang - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } +# dsr1-fp8-h200-sglang: +# image: lmsysorg/sglang:v0.5.5-cu129-amd64 +# model: deepseek-ai/DeepSeek-R1-0528 +# model-prefix: dsr1 +# runner: h200 +# precision: fp8 +# framework: sglang +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 8, conc-start: 4, conc-end: 64 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 8, conc-start: 4, conc-end: 64 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 8, conc-start: 4, conc-end: 64 } -dsr1-fp8-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: h200 - precision: fp8 - framework: trt - multinode: false - # For all sequence lengths, EP=TP - seq-len-configs: - - isl: 1024 - osl: 1024 - # If CONC > 64, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - # If CONC > 64, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - # If CONC > 32, then DP_ATTN=true - search-space: - - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } +# dsr1-fp8-h200-trt: +# image: 
nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 +# model: deepseek-ai/DeepSeek-R1-0528 +# model-prefix: dsr1 +# runner: h200 +# precision: fp8 +# framework: trt +# multinode: false +# # For all sequence lengths, EP=TP +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# # If CONC > 64, then DP_ATTN=true +# search-space: +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +# - isl: 1024 +# osl: 8192 +# # If CONC > 64, then DP_ATTN=true +# search-space: +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +# - isl: 8192 +# osl: 1024 +# # If CONC > 32, then DP_ATTN=true +# search-space: +# - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } -gptoss-fp4-b200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: b200-trt - precision: fp4 - framework: trt - multinode: false - # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 64, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 16 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 64, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 128 } - - { tp: 8, conc-start: 4, conc-end: 8 } +# gptoss-fp4-b200-trt: +# image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 +# model: openai/gpt-oss-120b +# model-prefix: gptoss +# runner: b200-trt +# precision: fp4 +# framework: trt +# multinode: false +# # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# 
search-space: +# - { tp: 1, conc-start: 64, conc-end: 128 } +# - { tp: 2, conc-start: 4, conc-end: 128 } +# - { tp: 4, conc-start: 4, conc-end: 128 } +# - { tp: 8, conc-start: 4, conc-end: 8 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 1, conc-start: 64, conc-end: 128 } +# - { tp: 2, conc-start: 4, conc-end: 128 } +# - { tp: 4, conc-start: 4, conc-end: 128 } +# - { tp: 8, conc-start: 4, conc-end: 16 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 1, conc-start: 64, conc-end: 128 } +# - { tp: 2, conc-start: 4, conc-end: 128 } +# - { tp: 4, conc-start: 4, conc-end: 128 } +# - { tp: 8, conc-start: 4, conc-end: 8 } -gptoss-fp4-b200-vllm: - image: vllm/vllm-openai:v0.11.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: b200 - precision: fp4 - framework: vllm - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 128 } - - { tp: 2, conc-start: 4, conc-end: 128 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 4 } +# gptoss-fp4-b200-vllm: +# image: vllm/vllm-openai:v0.11.0 +# model: openai/gpt-oss-120b +# model-prefix: gptoss +# runner: b200 +# precision: fp4 +# framework: vllm +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 1, conc-start: 4, conc-end: 128 } +# - { tp: 2, conc-start: 4, conc-end: 128 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 8 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 1, conc-start: 4, conc-end: 
128 } +# - { tp: 2, conc-start: 4, conc-end: 128 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 8 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 1, conc-start: 4, conc-end: 128 } +# - { tp: 2, conc-start: 4, conc-end: 128 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 4 } -gptoss-fp4-h100-vllm: - image: vllm/vllm-openai:v0.11.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: h100 - precision: fp4 - framework: vllm - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 16 } +# gptoss-fp4-h100-vllm: +# image: vllm/vllm-openai:v0.11.0 +# model: openai/gpt-oss-120b +# model-prefix: gptoss +# runner: h100 +# precision: fp4 +# framework: vllm +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 2, conc-start: 4, conc-end: 64 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 64 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 2, conc-start: 4, conc-end: 64 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 64 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 2, conc-start: 4, conc-end: 64 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 16 } -gptoss-fp4-h200-trt: - image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: h200 - precision: fp4 - framework: trt - 
multinode: false - # For all sequence lengths, EP=TP, DP_ATTENTION=false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } - - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } +# gptoss-fp4-h200-trt: +# image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev +# model: openai/gpt-oss-120b +# model-prefix: gptoss +# runner: h200 +# precision: fp4 +# framework: trt +# multinode: false +# # For all sequence lengths, EP=TP, DP_ATTENTION=false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } +# - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } +# - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } +# - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } +# - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } +# - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } +# - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 1, ep: 1, dp-attn: false, 
conc-start: 4, conc-end: 64 } +# - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } +# - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } +# - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } -gptoss-fp4-h200-vllm: - image: vllm/vllm-openai:v0.11.0 - model: openai/gpt-oss-120b - model-prefix: gptoss - runner: h200 - precision: fp4 - framework: vllm - multinode: false - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 4 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 1024 - osl: 8192 - search-space: - - { tp: 1, conc-start: 4, conc-end: 4 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 64 } - - isl: 8192 - osl: 1024 - search-space: - - { tp: 1, conc-start: 4, conc-end: 64 } - - { tp: 2, conc-start: 4, conc-end: 64 } - - { tp: 4, conc-start: 4, conc-end: 64 } - - { tp: 8, conc-start: 4, conc-end: 32 } +# gptoss-fp4-h200-vllm: +# image: vllm/vllm-openai:v0.11.0 +# model: openai/gpt-oss-120b +# model-prefix: gptoss +# runner: h200 +# precision: fp4 +# framework: vllm +# multinode: false +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - { tp: 1, conc-start: 4, conc-end: 4 } +# - { tp: 2, conc-start: 4, conc-end: 64 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 64 } +# - isl: 1024 +# osl: 8192 +# search-space: +# - { tp: 1, conc-start: 4, conc-end: 4 } +# - { tp: 2, conc-start: 4, conc-end: 64 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 64 } +# - isl: 8192 +# osl: 1024 +# search-space: +# - { tp: 1, conc-start: 4, conc-end: 64 } +# - { tp: 2, conc-start: 4, conc-end: 64 } +# - { tp: 4, conc-start: 4, conc-end: 64 } +# - { tp: 8, conc-start: 4, conc-end: 32 } dsr1-fp4-gb200-dynamo-trt: image: 
nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 From da73311ec9d09a26a786a35c0e077bb0c5c0636a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 18:56:29 -0600 Subject: [PATCH 50/98] testing --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 4a67cb347..475f74e82 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -17,12 +17,15 @@ else DECODE_MTP_SIZE="0" fi +PERFORMANCE_SWEEPS_PATH="components/backends/trtllm/performance_sweeps" + echo "Cloning Dynamo repository..." git clone https://github.com/ai-dynamo/dynamo.git +cd dynamo git checkout release/0.5.1-rc0.20251105 git submodule update --init --recursive -cd dynamo/components/backends/trtllm/performance_sweeps +cd "$PERFORMANCE_SWEEPS_PATH" # Set up environment variables based on ISL/OSL if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then From 1227541670ef44cc53492f83601bab3644d1a001 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:23:46 -0600 Subject: [PATCH 51/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 135 +++++++++++++++++++++++--------- 1 file changed, 98 insertions(+), 37 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 378e58848..db94a0ff6 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -24,10 +24,71 @@ jobs: CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - test-sweep: + # test-sweep: + # needs: get-jobs + # uses: ./.github/workflows/benchmark-tmpl.yml + # name: ${{ inputs.generate-cli-command }} + # strategy: + # fail-fast: false + # matrix: + # config: ${{ 
fromJson(needs.get-jobs.outputs.search-space-config) }} + # secrets: inherit + # with: + # exp-name: ${{ matrix.config.exp-name }} + # isl: ${{ matrix.config.isl }} + # osl: ${{ matrix.config.osl }} + # max-model-len: ${{ matrix.config.max-model-len }} + # runner: ${{ matrix.config.runner }} + # image: ${{ matrix.config.image }} + # model: ${{ matrix.config.model }} + # framework: ${{ matrix.config.framework }} + # precision: ${{ matrix.config.precision }} + # tp: ${{ matrix.config.tp }} + # ep: ${{ matrix.config.ep }} + # dp-attn: ${{ matrix.config.dp-attn }} + # conc: ${{ matrix.config.conc }} + + test-sweep-multi-node: + needs: get-jobs + if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && !fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill}} + uses: ./.github/workflows/benchmark-multinode-tmpl.yml + name: ${{ inputs.generate-cli-command }} multi-node / + strategy: + fail-fast: false + matrix: + config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} + secrets: inherit + with: + isl: 1024 + osl: 1024 + max-model-len: 2048 + runner: ${{ matrix.config.runner }} + image: ${{ matrix.config.image }} + model: ${{ matrix.config.model }} + framework: ${{ matrix.config.framework }} + precision: ${{ matrix.config.precision }} + exp-name: ${{ matrix.config.exp-name }} + conc-list: ${{ toJson(matrix.config.conc) }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} + + prefill-num-worker: ${{ matrix.config.prefill.num-worker }} + prefill-tp: ${{ matrix.config.prefill.tp }} + prefill-ep: ${{ matrix.config.prefill.ep }} + prefill-dp-attn: ${{ matrix.config.prefill.dp-attn }} + prefill-additional-settings: ${{ toJson(matrix.config.prefill.additional-settings) }} + + decode-num-worker: ${{ matrix.config.decode.num-worker }} + decode-tp: ${{ matrix.config.decode.tp }} + decode-ep: ${{ matrix.config.decode.ep }} + decode-dp-attn: ${{ matrix.config.decode.dp-attn }} + decode-additional-settings: ${{ 
toJson(matrix.config.decode.additional-settings) }} + + test-sweep-single-node: needs: get-jobs + if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && !fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill }} uses: ./.github/workflows/benchmark-tmpl.yml - name: ${{ inputs.generate-cli-command }} + name: ${{ inputs.generate-cli-command }} single-node / strategy: fail-fast: false matrix: @@ -48,41 +109,41 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - collect-results: - needs: test-sweep - if: ${{ always() }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit + # collect-results: + # needs: test-sweep + # if: ${{ always() }} + # uses: ./.github/workflows/collect-results.yml + # secrets: inherit - calc-success-rate: - needs: collect-results - if: ${{ always() }} - runs-on: ubuntu-latest + # calc-success-rate: + # needs: collect-results + # if: ${{ always() }} + # runs-on: ubuntu-latest - env: - RESULTS_DIR: "results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} + # env: + # RESULTS_DIR: "results/" + # STATS_FILENAME: "run_stats" + # GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json + # steps: + # - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + # with: + # token: ${{ secrets.REPO_PAT }} + # fetch-depth: 0 + + # - name: Download 
results artifacts + # uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + # with: + # path: ${{ env.RESULTS_DIR }} + # pattern: results_* + + # - name: Install python dependencies + # run: pip install PyGithub + + # - name: Calculate success rate + # run: python3 utils/calc_success_rate.py $STATS_FILENAME + + # - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + # with: + # name: "run-stats" + # path: ${{ env.STATS_FILENAME }}.json From 0828c3ec66e1f51c9cb1553782dfbd3ef0380865 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:26:43 -0600 Subject: [PATCH 52/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index db94a0ff6..308da5e13 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -22,8 +22,20 @@ jobs: run: | pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) + echo "${{fromJson(needs.get-jobs.outputs.search-space-config) }}" echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT + test: + runs-on: ubuntu-latest + needs: get-jobs + steps: + - id: get-jobs + run: | + set -x + echo "${{fromJson(needs.get-jobs.outputs.search-space-config) }}" + echo "${{fromJson(needs.get-jobs.outputs.search-space-config)[0] }}" + echo "${{fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill }}" + # test-sweep: # needs: get-jobs # uses: ./.github/workflows/benchmark-tmpl.yml From 4c0dc18b79543c6042bf6b16ed86b711639225e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:27:23 -0600 Subject: [PATCH 53/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 308da5e13..734d56276 
100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -22,7 +22,6 @@ jobs: run: | pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) - echo "${{fromJson(needs.get-jobs.outputs.search-space-config) }}" echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT test: From 0a8a6c67ad4802307baa54e8a6e111ea7d4b5cc0 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:32:56 -0600 Subject: [PATCH 54/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 734d56276..56d9f7999 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -61,7 +61,7 @@ jobs: test-sweep-multi-node: needs: get-jobs - if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && !fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill}} + if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill != null }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: ${{ inputs.generate-cli-command }} multi-node / strategy: @@ -97,7 +97,7 @@ jobs: test-sweep-single-node: needs: get-jobs - if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && !fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill }} + if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill == null }} uses: ./.github/workflows/benchmark-tmpl.yml name: ${{ inputs.generate-cli-command }} single-node / strategy: From 4e98fe41a763fc46cb8781cf1a414103580bbbff Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:42:31 -0600 Subject: [PATCH 55/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 9 ++++++--- 1 file changed, 6 
insertions(+), 3 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 56d9f7999..bf65a0a76 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -21,7 +21,10 @@ jobs: - id: get-jobs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py ${{ inputs.generate-cli-command }}) + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py\ + --runner-config .github/configs/runners.yaml \ + --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml \ + ${{ inputs.generate-cli-command }}) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT test: @@ -63,7 +66,7 @@ jobs: needs: get-jobs if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill != null }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: ${{ inputs.generate-cli-command }} multi-node / + name: ${{ inputs.generate-cli-command }} multi-node strategy: fail-fast: false matrix: @@ -99,7 +102,7 @@ jobs: needs: get-jobs if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill == null }} uses: ./.github/workflows/benchmark-tmpl.yml - name: ${{ inputs.generate-cli-command }} single-node / + name: ${{ inputs.generate-cli-command }} single-node strategy: fail-fast: false matrix: From 7826b982f8782ddaa5b9289c53774c7742b74bb8 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:43:33 -0600 Subject: [PATCH 56/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index bf65a0a76..1b8a8dafe 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -21,7 +21,7 @@ jobs: - id: get-jobs run: | pip install 
pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py\ + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py \ --runner-config .github/configs/runners.yaml \ --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml \ ${{ inputs.generate-cli-command }}) From 3c1ce68ef0ddb3342dddb7daa8be1c0537d4716a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:48:39 -0600 Subject: [PATCH 57/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 1b8a8dafe..11a21f71e 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -22,9 +22,9 @@ jobs: run: | pip install pydantic CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py \ + ${{ inputs.generate-cli-command }} \ --runner-config .github/configs/runners.yaml \ - --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml \ - ${{ inputs.generate-cli-command }}) + --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT test: From 15eda164ccec128cfe4b7402e4931c230cb3bacf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 19:57:14 -0600 Subject: [PATCH 58/98] adding testing workflows --- .github/configs/nvidia-master.yaml | 912 +++++++++--------- .github/workflows/gb200-tests.yml | 91 -- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 6 +- 3 files changed, 450 insertions(+), 559 deletions(-) delete mode 100644 .github/workflows/gb200-tests.yml diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 2dd8711df..9cae7ea11 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -339,433 +339,414 @@ 
dsr1-fp4-gb200-dynamo-trt: - isl: 1024 osl: 1024 search-space: - # MTP configurations - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # NOTE: Prefill tp and ep are always 4 because each GB200 node has 4 GPUs and - # ctx_tp_size is hardcoded to 4 in launch_gb200-nv.sh. Decode tp/ep matches gen_tp_size. - # For 1k/1k: prefill batch-size=4, max-num-tokens=4608 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8, 16, 36 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=3" - - "DECODE_EPLB_NUM_SLOTS=0" + # # MTP configurations + # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # # NOTE: Prefill tp and ep are always 4 because each GB200 node has 4 GPUs and + # # ctx_tp_size is hardcoded to 4 in launch_gb200-nv.sh. Decode tp/ep matches gen_tp_size. 
+ # # For 1k/1k: prefill batch-size=4, max-num-tokens=4608 + # - spec-decoding: "mtp" + # conc-list: [ 1, 2, 4, 8, 16, 36 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: false + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: false + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=128" + # - "DECODE_MAX_BATCH_SIZE=32" + # - "DECODE_GPU_MEM_FRACTION=0.9" + # - "DECODE_MTP_SIZE=3" - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - spec-decoding: "mtp" - conc-list: [ 512, 1075 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - - "DECODE_EPLB_NUM_SLOTS=0" + # # dep - Run Data-Expert Parallel mode (attention_dp=true) + # - spec-decoding: "mtp" + # conc-list: [ 512, 1075 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=256" + # - "DECODE_MAX_BATCH_SIZE=64" + # - "DECODE_GPU_MEM_FRACTION=0.7" + # - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 2150 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=1" - - "DECODE_EPLB_NUM_SLOTS=0" + # - spec-decoding: "mtp" + # conc-list: [ 2150 ] + # 
prefill: + # num-worker: 2 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=256" + # - "DECODE_MAX_BATCH_SIZE=128" + # - "DECODE_GPU_MEM_FRACTION=0.7" + # - "DECODE_MTP_SIZE=1" - - spec-decoding: "mtp" - conc-list: [ 512 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.6" - - "DECODE_MTP_SIZE=3" - - "DECODE_EPLB_NUM_SLOTS=0" + # - spec-decoding: "mtp" + # conc-list: [ 512 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 32 + # ep: 32 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=64" + # - "DECODE_MAX_BATCH_SIZE=16" + # - "DECODE_GPU_MEM_FRACTION=0.6" + # - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 2252 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=1" - - "DECODE_EPLB_NUM_SLOTS=0" + # - spec-decoding: "mtp" + # conc-list: [ 2252 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - 
"DECODE_MAX_NUM_TOKENS=512" + # - "DECODE_MAX_BATCH_SIZE=256" + # - "DECODE_GPU_MEM_FRACTION=0.8" + # - "DECODE_MTP_SIZE=1" - # Non-MTP configurations (default spec_decoding="none") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # # Non-MTP configurations (default spec_decoding="none") + # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: false + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 4 + # tp: 8 + # ep: 8 + # dp-attn: false + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=128" + # - "DECODE_MAX_BATCH_SIZE=128" + # - "DECODE_GPU_MEM_FRACTION=0.9" + # - "DECODE_MTP_SIZE=0" - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - conc-list: [ 1075 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # # dep - Run Data-Expert Parallel mode (attention_dp=true) + # - conc-list: [ 1075 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 32 + # ep: 
32 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=32" + # - "DECODE_MAX_BATCH_SIZE=32" + # - "DECODE_GPU_MEM_FRACTION=0.7" + # - "DECODE_MTP_SIZE=0" - - conc-list: [ 1075 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # - conc-list: [ 1075 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=64" + # - "DECODE_MAX_BATCH_SIZE=64" + # - "DECODE_GPU_MEM_FRACTION=0.75" + # - "DECODE_MTP_SIZE=0" - - conc-list: [ 2048, 4300 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # - conc-list: [ 2048, 4300 ] + # prefill: + # num-worker: 2 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=256" + # - "DECODE_MAX_BATCH_SIZE=256" + # - "DECODE_GPU_MEM_FRACTION=0.75" + # - "DECODE_MTP_SIZE=0" - - conc-list: [ 4300 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - 
decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=512" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # - conc-list: [ 4300 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=4608" + # - "PREFILL_MAX_BATCH_SIZE=4" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=512" + # - "DECODE_MAX_BATCH_SIZE=512" + # - "DECODE_GPU_MEM_FRACTION=0.8" + # - "DECODE_MTP_SIZE=0" - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations (spec_decoding="mtp") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8, 18 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=3" - - "DECODE_EPLB_NUM_SLOTS=0" + # - isl: 8192 + # osl: 1024 + # search-space: + # # MTP configurations (spec_decoding="mtp") + # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 + # - spec-decoding: "mtp" + # conc-list: [ 1, 2, 4, 8, 18 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: false + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: false + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=64" + # - "DECODE_MAX_BATCH_SIZE=16" + # - "DECODE_GPU_MEM_FRACTION=0.9" + # - "DECODE_MTP_SIZE=3" - # dep - Run Data-Expert Parallel mode 
(attention_dp=true) - - spec-decoding: "mtp" - conc-list: [ 128, 269 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=8" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - - "DECODE_EPLB_NUM_SLOTS=0" + # # dep - Run Data-Expert Parallel mode (attention_dp=true) + # - spec-decoding: "mtp" + # conc-list: [ 128, 269 ] + # prefill: + # num-worker: 5 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 1 + # tp: 32 + # ep: 32 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=32" + # - "DECODE_MAX_BATCH_SIZE=8" + # - "DECODE_GPU_MEM_FRACTION=0.7" + # - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 538 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - - "DECODE_EPLB_NUM_SLOTS=0" + # - spec-decoding: "mtp" + # conc-list: [ 538 ] + # prefill: + # num-worker: 8 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 1 + # tp: 32 + # ep: 32 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=64" + # - "DECODE_MAX_BATCH_SIZE=16" + # - "DECODE_GPU_MEM_FRACTION=0.7" + # - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 1075 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - 
"PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=2" - - "DECODE_EPLB_NUM_SLOTS=0" + # - spec-decoding: "mtp" + # conc-list: [ 1075 ] + # prefill: + # num-worker: 8 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=256" + # - "DECODE_MAX_BATCH_SIZE=64" + # - "DECODE_GPU_MEM_FRACTION=0.75" + # - "DECODE_MTP_SIZE=2" - - spec-decoding: "mtp" - conc-list: [ 2150 ] - prefill: - num-worker: 6 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=1" - - "DECODE_EPLB_NUM_SLOTS=0" + # - spec-decoding: "mtp" + # conc-list: [ 2150 ] + # prefill: + # num-worker: 6 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 1 + # tp: 8 + # ep: 8 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=512" + # - "DECODE_MAX_BATCH_SIZE=256" + # - "DECODE_GPU_MEM_FRACTION=0.8" + # - "DECODE_MTP_SIZE=1" - # Non-MTP configurations (default spec_decoding="none") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - - conc-list: [ 1, 2, 4, 8, 16, 34 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - 
"DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # # Non-MTP configurations (default spec_decoding="none") + # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # - conc-list: [ 1, 2, 4, 8, 16, 34 ] + # prefill: + # num-worker: 1 + # tp: 4 + # ep: 4 + # dp-attn: false + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 3 + # tp: 8 + # ep: 8 + # dp-attn: false + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=32" + # - "DECODE_MAX_BATCH_SIZE=32" + # - "DECODE_GPU_MEM_FRACTION=0.9" + # - "DECODE_MTP_SIZE=0" - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - conc-list: [ 256, 538 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=16" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # # dep - Run Data-Expert Parallel mode (attention_dp=true) + # - conc-list: [ 256, 538 ] + # prefill: + # num-worker: 4 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 1 + # tp: 32 + # ep: 32 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=16" + # - "DECODE_MAX_BATCH_SIZE=16" + # - "DECODE_GPU_MEM_FRACTION=0.7" + # - "DECODE_MTP_SIZE=0" - - conc-list: [ 1075 ] - prefill: - num-worker: 6 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - 
"DECODE_EPLB_NUM_SLOTS=0" + # - conc-list: [ 1075 ] + # prefill: + # num-worker: 6 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=64" + # - "DECODE_MAX_BATCH_SIZE=64" + # - "DECODE_GPU_MEM_FRACTION=0.75" + # - "DECODE_MTP_SIZE=0" - - conc-list: [ 2150 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" + # - conc-list: [ 2150 ] + # prefill: + # num-worker: 8 + # tp: 4 + # ep: 4 + # dp-attn: true + # additional-settings: + # - "PREFILL_MAX_NUM_TOKENS=8448" + # - "PREFILL_MAX_BATCH_SIZE=1" + # decode: + # num-worker: 1 + # tp: 16 + # ep: 16 + # dp-attn: true + # additional-settings: + # - "DECODE_MAX_NUM_TOKENS=128" + # - "DECODE_MAX_BATCH_SIZE=128" + # - "DECODE_GPU_MEM_FRACTION=0.75" + # - "DECODE_MTP_SIZE=0" - conc-list: [ 2150 ] prefill: @@ -786,61 +767,60 @@ dsr1-fp4-gb200-dynamo-trt: - "DECODE_MAX_BATCH_SIZE=256" - "DECODE_GPU_MEM_FRACTION=0.8" - "DECODE_MTP_SIZE=0" - - "DECODE_EPLB_NUM_SLOTS=0" -dsr1-fp8-gb200-dynamo-sglang: - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: gb200 - precision: fp8 - framework: dynamo-sglang - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - spec-decoding: "none" - # conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 3 - # tp, ep, and dp-attn do nothing because they are hardcoded in the 
following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=6" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=12" - - isl: 8192 - osl: 1024 - search-space: - - spec-decoding: "none" - # conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] - conc-list: [ 128 ] - prefill: - num-worker: 6 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=12" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=6" +# dsr1-fp8-gb200-dynamo-sglang: +# image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 +# model: deepseek-ai/DeepSeek-R1-0528 +# model-prefix: dsr1 +# runner: gb200 +# precision: fp8 +# framework: dynamo-sglang +# multinode: true +# disagg: true +# seq-len-configs: +# - isl: 1024 +# osl: 1024 +# search-space: +# - spec-decoding: "none" +# # conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] +# conc-list: [ 1024, 2048 ] +# prefill: +# num-worker: 3 +# # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: +# # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh +# tp: 1 +# ep: 1 +# dp-attn: true +# additional-settings: +# - "PREFILL_NODES=6" +# - "N_ADDITIONAL_FRONTENDS=8" +# decode: +# num-worker: 1 +# tp: 1 +# ep: 1 +# dp-attn: true +# additional-settings: +# - "DECODE_NODES=12" +# - isl: 8192 +# osl: 1024 +# search-space: +# - spec-decoding: "none" +# # conc-list: [ 128, 256, 384, 448, 
512, 576, 1024, 2048, 4096 ] +# conc-list: [ 128 ] +# prefill: +# num-worker: 6 +# # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: +# # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh +# tp: 1 +# ep: 1 +# dp-attn: true +# additional-settings: +# - "PREFILL_NODES=12" +# - "N_ADDITIONAL_FRONTENDS=8" +# decode: +# num-worker: 1 +# tp: 1 +# ep: 1 +# dp-attn: true +# additional-settings: +# - "DECODE_NODES=6" diff --git a/.github/workflows/gb200-tests.yml b/.github/workflows/gb200-tests.yml deleted file mode 100644 index c700599d9..000000000 --- a/.github/workflows/gb200-tests.yml +++ /dev/null @@ -1,91 +0,0 @@ -name: GB200 Tests - -on: - workflow_dispatch: - inputs: - image: - description: "Serving Image" - required: true - type: choice - options: - - "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1" - - "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3" - - model: - description: "Model" - required: true - type: choice - options: - - "deepseek-ai/DeepSeek-R1-0528" - - "deepseek-r1-fp4" - - precision: - description: "Precision" - required: true - type: choice - options: - - "fp4" - - "fp8" - - framework: - description: "Framework" - required: true - type: choice - options: - - "dynamo-trtllm" - - "dynamo-sglang" - - mtp: - description: "Mtp On/Off" - required: true - type: choice - options: - - "on" - - "off" - - isl: - description: "ISL" - required: true - type: string - - osl: - description: "OSL" - required: true - type: string - -jobs: - pre-run: - runs-on: ubuntu-latest - outputs: - max-model-len: ${{ steps.calc.outputs.max-model-len }} - steps: - - id: calc - shell: python - run: | - import os - import sys - try: - isl = int("${{ inputs.isl }}") - osl = int("${{ inputs.osl }}") - except ValueError: - print("Error: ISL and OSL must be integers") - sys.exit(1) - with open(os.environ['GITHUB_OUTPUT'], 'a') as f: - f.write(f"max-model-len={isl 
+ osl}\n") - - benchmark-gb200: - needs: pre-run - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 test - secrets: inherit - with: - runner: gb200 - image: ${{ inputs.image }} - model: ${{ inputs.model }} - framework: ${{ inputs.framework }} - precision: ${{ inputs.precision }} - exp-name: dsr1_1k1k - isl: ${{ inputs.isl }} - osl: ${{ inputs.osl }} - max-model-len: ${{ needs.pre-run.outputs.max-model-len }} - mtp-mode: ${{ inputs.mtp }} diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 475f74e82..51037c58f 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -8,7 +8,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING \ PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ PREFILL_MAX_NUM_TOKENS PREFILL_MAX_BATCH_SIZE DECODE_MAX_NUM_TOKENS \ - DECODE_MAX_BATCH_SIZE DECODE_GPU_MEM_FRACTION DECODE_EPLB_NUM_SLOTS \ + DECODE_MAX_BATCH_SIZE DECODE_GPU_MEM_FRACTION \ NTASKS_PER_NODE if [ "$SPEC_DECODING" == "mtp" ]; then @@ -44,6 +44,8 @@ gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) total_tasks=$((total_nodes * NTASKS_PER_NODE)) +decode_eplb_num_slots=0 + sbatch --nodes=${total_nodes} \ --ntasks=${total_tasks} \ --ntasks-per-node=${NTASKS_PER_NODE} \ @@ -54,7 +56,7 @@ sbatch --nodes=${total_nodes} \ ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ - ${DECODE_GPU_MEM_FRACTION} ${DECODE_EPLB_NUM_SLOTS} \ + ${DECODE_GPU_MEM_FRACTION} ${decode_eplb_num_slots} \ ${DECODE_MTP_SIZE} "${CONC_LIST}" \ ${gen_nodes} ${kind} \ ${MODEL_PATH} ${SERVED_MODEL_NAME} \ From ee065daf7af77b59e13bf473f6338c02fb55004d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 21:06:27 -0600 Subject: [PATCH 59/98] adding testing workflows --- 
.github/workflows/e2e-tests.yml | 114 +++++++----------- .../workflows/full-sweep-1k1k-scheduler.yml | 4 +- utils/summarize.py | 62 +++++----- 3 files changed, 77 insertions(+), 103 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 11a21f71e..127db58ec 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -27,43 +27,11 @@ jobs: --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml) echo "search-space-config=$CONFIG_JSON" >> $GITHUB_OUTPUT - test: - runs-on: ubuntu-latest - needs: get-jobs - steps: - - id: get-jobs - run: | - set -x - echo "${{fromJson(needs.get-jobs.outputs.search-space-config) }}" - echo "${{fromJson(needs.get-jobs.outputs.search-space-config)[0] }}" - echo "${{fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill }}" - - # test-sweep: - # needs: get-jobs - # uses: ./.github/workflows/benchmark-tmpl.yml - # name: ${{ inputs.generate-cli-command }} - # strategy: - # fail-fast: false - # matrix: - # config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} - # secrets: inherit - # with: - # exp-name: ${{ matrix.config.exp-name }} - # isl: ${{ matrix.config.isl }} - # osl: ${{ matrix.config.osl }} - # max-model-len: ${{ matrix.config.max-model-len }} - # runner: ${{ matrix.config.runner }} - # image: ${{ matrix.config.image }} - # model: ${{ matrix.config.model }} - # framework: ${{ matrix.config.framework }} - # precision: ${{ matrix.config.precision }} - # tp: ${{ matrix.config.tp }} - # ep: ${{ matrix.config.ep }} - # dp-attn: ${{ matrix.config.dp-attn }} - # conc: ${{ matrix.config.conc }} - test-sweep-multi-node: needs: get-jobs + # Use existence (or non-existence) of 'prefill' field as a proxy to determined multi-node tests. 
+ # We only need to check the first entry because by design, all entries in the matrix will be of the same type (there will + # never be a mix of multi-node and single-node entries as output from generate_sweep_configs.py). if: ${{ needs.get-jobs.outputs.search-space-config != '[]' && fromJson(needs.get-jobs.outputs.search-space-config)[0].prefill != null }} uses: ./.github/workflows/benchmark-multinode-tmpl.yml name: ${{ inputs.generate-cli-command }} multi-node @@ -73,9 +41,9 @@ jobs: config: ${{ fromJson(needs.get-jobs.outputs.search-space-config) }} secrets: inherit with: - isl: 1024 - osl: 1024 - max-model-len: 2048 + isl: ${{ matrix.config.isl }} + osl: ${{ matrix.config.osl }} + max-model-len: ${{ matrix.config.max-model-len }} runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -123,41 +91,41 @@ jobs: dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} - # collect-results: - # needs: test-sweep - # if: ${{ always() }} - # uses: ./.github/workflows/collect-results.yml - # secrets: inherit - - # calc-success-rate: - # needs: collect-results - # if: ${{ always() }} - # runs-on: ubuntu-latest - - # env: - # RESULTS_DIR: "results/" - # STATS_FILENAME: "run_stats" - # GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - # steps: - # - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - # with: - # token: ${{ secrets.REPO_PAT }} - # fetch-depth: 0 - - # - name: Download results artifacts - # uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - # with: - # path: ${{ env.RESULTS_DIR }} - # pattern: results_* + collect-results: + needs: [test-sweep-multi-node, test-sweep-single-node] + if: ${{ always() }} + uses: ./.github/workflows/collect-results.yml + secrets: inherit - # - name: Install python dependencies - # run: pip install PyGithub + calc-success-rate: + needs: collect-results + if: ${{ always() }} + runs-on: ubuntu-latest - # - name: Calculate 
success rate - # run: python3 utils/calc_success_rate.py $STATS_FILENAME + env: + RESULTS_DIR: "results/" + STATS_FILENAME: "run_stats" + GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - # - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - # with: - # name: "run-stats" - # path: ${{ env.STATS_FILENAME }}.json + steps: + - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 + with: + token: ${{ secrets.REPO_PAT }} + fetch-depth: 0 + + - name: Download results artifacts + uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 + with: + path: ${{ env.RESULTS_DIR }} + pattern: results_* + + - name: Install python dependencies + run: pip install PyGithub + + - name: Calculate success rate + run: python3 utils/calc_success_rate.py $STATS_FILENAME + + - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 + with: + name: "run-stats" + path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 11af50c48..db8da19fd 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -167,7 +167,7 @@ jobs: benchmark-dsr1-single-node, benchmark-dsr1-multi-node, ] - if: ${{ always() && needs.get-dsr1-configs.result == 'success' }} + if: ${{ always() && needs.get-dsr1-configs.result == 'success' && needs.get-dsr1-configs.outputs.single-node-search-space-config != '[]' && needs.get-dsr1-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: @@ -180,7 +180,7 @@ jobs: benchmark-gptoss-single-node, benchmark-gptoss-multi-node, ] - if: ${{ always() && needs.get-gptoss-configs.result == 'success' }} + if: ${{ always() && needs.get-gptoss-configs.result == 'success' && needs.get-gptoss-configs.outputs.single-node-search-space-config != '[]' && 
needs.get-gptoss-configs.outputs.multi-node-search-space-config != '[]' }} uses: ./.github/workflows/collect-results.yml secrets: inherit with: diff --git a/utils/summarize.py b/utils/summarize.py index 93035b741..75cf6610b 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -10,18 +10,20 @@ result = json.load(f) results.append(result) -is_multinode = results[0]['is_multinode'] +single_node_results = [r for r in results if not r['is_multinode']] +multinode_results = [r for r in results if r['is_multinode']] -if is_multinode: - results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) +if single_node_results: + single_node_results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) - summary_header = '''\ -| Model | Hardware | Framework | Precision | ISL | OSL | Prefill TP | Prefill EP | Prefill DP Attn | Prefill Workers | Prefill GPUs | Decode TP | Decode EP | Decode DP Attn | Decode Workers | Decode GPUs | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ + print("## Single-Node Results\n") + single_node_header = '''\ +| Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' - print(summary_header) + print(single_node_header) - for result in results: + for result in single_node_results: print( f"| {result['model']} " f"| {result['hw'].upper()} " @@ -29,16 +31,9 @@ f"| 
{result['precision'].upper()} " f"| {result['isl']} " f"| {result['osl']} " - f"| {result['prefill_tp']} " - f"| {result['prefill_ep']} " - f"| {result['prefill_dp_attention']} " - f"| {result['prefill_num_workers']} " - f"| {result['num_prefill_gpu']} " - f"| {result['decode_tp']} " - f"| {result['decode_ep']} " - f"| {result['decode_dp_attention']} " - f"| {result['decode_num_workers']} " - f"| {result['num_decode_gpu']} " + f"| {result['tp']} " + f"| {result['ep']} " + f"| {result['dp_attention']} " f"| {result['conc']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " @@ -48,16 +43,20 @@ f"| {result['output_tput_per_gpu']:.4f} " f"| {result['input_tput_per_gpu']:.4f} |" ) -else: - results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) + + print("\n") - summary_header = '''\ -| Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +if multinode_results: + multinode_results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) + + print("## Multi-Node Results\n") + multinode_header = '''\ +| Model | Hardware | Framework | Precision | ISL | OSL | Prefill TP | Prefill EP | Prefill DP Attn | Prefill Workers | Prefill GPUs | Decode TP | Decode EP | Decode DP Attn | Decode Workers | Decode GPUs | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' - 
print(summary_header) + print(multinode_header) - for result in results: + for result in multinode_results: print( f"| {result['model']} " f"| {result['hw'].upper()} " @@ -65,9 +64,16 @@ f"| {result['precision'].upper()} " f"| {result['isl']} " f"| {result['osl']} " - f"| {result['tp']} " - f"| {result['ep']} " - f"| {result['dp_attention']} " + f"| {result['prefill_tp']} " + f"| {result['prefill_ep']} " + f"| {result['prefill_dp_attention']} " + f"| {result['prefill_num_workers']} " + f"| {result['num_prefill_gpu']} " + f"| {result['decode_tp']} " + f"| {result['decode_ep']} " + f"| {result['decode_dp_attention']} " + f"| {result['decode_num_workers']} " + f"| {result['num_decode_gpu']} " f"| {result['conc']} " f"| {(result['median_ttft'] * 1000):.4f} " f"| {(result['median_tpot'] * 1000):.4f} " From cf583ae4e2650989b7cf87cf602f6944e3cdefa3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 21:13:38 -0600 Subject: [PATCH 60/98] adding testing workflows --- .github/workflows/collect-results.yml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index 8c0487320..b37d789a0 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -40,10 +40,6 @@ jobs: name: results_${{ inputs.exp-name || 'all' }} path: agg_${{ inputs.exp-name || 'all' }}.json - - name: Plot performance - run: | - pip install -q matplotlib - python3 utils/plot_perf.py results/ ${{ inputs.exp-name || 'all' }} - name: Upload performance graphs uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 with: From 17c8fcb7fe01a659e5e5ab8263c1eb23455c64e3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 21:14:58 -0600 Subject: [PATCH 61/98] adding testing workflows --- .github/workflows/collect-results.yml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.github/workflows/collect-results.yml 
b/.github/workflows/collect-results.yml index b37d789a0..e4df99d1b 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -39,11 +39,3 @@ jobs: with: name: results_${{ inputs.exp-name || 'all' }} path: agg_${{ inputs.exp-name || 'all' }}.json - - - name: Upload performance graphs - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: graphs_${{ inputs.exp-name || 'all' }} - path: | - tput_vs_intvty_*_${{ inputs.exp-name || 'all' }}.png - tput_vs_e2el_*_${{ inputs.exp-name || 'all' }}.png From 3994996596d507e7f8fb63e66bc589aa0f3a2bff Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 21:15:19 -0600 Subject: [PATCH 62/98] adding testing workflows --- .github/configs/nvidia-master.yaml | 48 +++++++++++++++--------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9cae7ea11..e0578eba5 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1,27 +1,27 @@ -# dsr1-fp4-b200-sglang: -# image: lmsysorg/sglang:v0.5.5-cu129-amd64 -# model: nvidia/DeepSeek-R1-0528-FP4-V2 -# model-prefix: dsr1 -# runner: b200 -# precision: fp4 -# framework: sglang -# multinode: false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } +dsr1-fp4-b200-sglang: + image: lmsysorg/sglang:v0.5.5-cu129-amd64 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + runner: b200 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 
1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } # dsr1-fp4-b200-trt: # image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 From b8661786fd73b5fdc651d35b1edf1b21b4eace1b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 21:47:16 -0600 Subject: [PATCH 63/98] adding testing workflows --- .github/workflows/benchmark-tmpl.yml | 8 + utils/matrix-logic/generate_sweep_configs.py | 5 +- .../test_generate_sweep_configs.py | 2264 ++++++----------- utils/matrix-logic/test_validation.py | 1282 ++++++++++ utils/matrix-logic/validation.py | 17 +- 5 files changed, 2036 insertions(+), 1540 deletions(-) create mode 100644 utils/matrix-logic/test_validation.py diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index 2ea94ed55..edf8454a0 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -41,6 +41,12 @@ on: conc: required: true type: string + spec-decoding: + required: true + type: string + disagg: + required: true + type: string random-range-ratio: required: false type: string @@ -62,6 +68,8 @@ env: EP_SIZE: ${{ inputs.ep }} DP_ATTENTION: ${{ inputs.dp-attn }} CONC: ${{ inputs.conc }} + SPEC_DECODING: ${{ inputs.spec-decoding }} + DISAGG: ${{ inputs.disagg }} permissions: contents: read diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 3943c317d..be87e5e25 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -156,6 +156,7 @@ def generate_full_sweep(args, all_config_data, runner_data): conc_end = 
bmk[Fields.CONC_END.value] ep = bmk.get(Fields.EP.value) dp_attn = bmk.get(Fields.DP_ATTN.value) + spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") # Apply max-tp filter if specified if args.max_tp and tp > args.max_tp: @@ -187,6 +188,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.MAX_MODEL_LEN.value: isl + osl + 200, Fields.EP.value: 1, # Default Fields.DP_ATTN.value: False, # Default + Fields.SPEC_DECODING.value: spec_decoding, Fields.EXP_NAME.value: f"{model_code}_{seq_len_str}", Fields.DISAGG.value: disagg, } @@ -217,7 +219,7 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): if not runner_nodes: raise ValueError( - f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_config.keys())}'.") + f"Runner '{args.runner_type}' does not exist in runner config '{args.runner_config}'. Must choose from existing runner types: '{', '.join(runner_data.keys())}'.") # Filter runner nodes if filter is specified if args.runner_node_filter: @@ -269,6 +271,7 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): Fields.TP.value: highest_tp, Fields.EP.value: 1, # Default, Fields.DP_ATTN.value: False, # Default + Fields.SPEC_DECODING.value: "none", # Default Fields.CONC.value: lowest_conc, Fields.MAX_MODEL_LEN.value: 2048, Fields.EXP_NAME.value: f"{model_code}_test", diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index c184ecbab..917457852 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -2,80 +2,73 @@ import yaml from unittest.mock import patch from generate_sweep_configs import ( - validate_master_configs_structure, - validate_matrix_output, seq_len_to_str, generate_full_sweep, - generate_test_config, generate_runner_model_sweep_config, - 
generate_runner_sweep_config, - generate_custom_test, load_config_files, + load_runner_file, main, - MatrixEntry, ) -# Fixtures for test config files +# ============================================================================ +# Fixtures +# ============================================================================ + @pytest.fixture -def sample_master_config(): - """Sample master config with valid entries.""" +def sample_single_node_config(): + """Sample master config with single-node entries.""" return { - "70b-fp8-vllm": { - "image": "vllm/vllm-openai:v0.10.2", - "model": "meta-llama/Llama-3-70b", - "model-prefix": "70b", - "precision": "fp8", - "framework": "vllm", + "dsr1-fp8-h200-sglang": { + "image": "lmsysorg/sglang:v0.5.5-cu129-amd64", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", "runner": "h200", + "precision": "fp8", + "framework": "sglang", + "multinode": False, "seq-len-configs": [ { "isl": 1024, "osl": 1024, "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 4}, - {"tp": 8, "conc-start": 2, "conc-end": 8, "ep": 2, "dp-attn": True} + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64, "ep": 2, "dp-attn": True} ] }, { "isl": 1024, "osl": 8192, "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 2} + {"tp": 8, "conc-start": 4, "conc-end": 32} ] - } - ] - }, - "8b-fp4-trt": { - "image": "nvcr.io/nvidia/tritonserver:24.01", - "model": "meta-llama/Llama-3-8b", - "model-prefix": "8b", - "precision": "fp4", - "framework": "trt", - "runner": "h100", - "seq-len-configs": [ + }, { - "isl": 1024, + "isl": 8192, "osl": 1024, "search-space": [ - {"tp": 2, "conc-start": 4, "conc-end": 16} + {"tp": 8, "conc-start": 4, "conc-end": 16} ] } ] }, - "gptoss-120b-fp8-vllm": { - "image": "vllm/vllm-openai:latest", + "gptoss-fp4-b200-vllm": { + "image": "vllm/vllm-openai:v0.11.0", "model": "openai/gpt-oss-120b", "model-prefix": "gptoss", - "precision": "fp8", + "runner": "b200", + "precision": 
"fp4", "framework": "vllm", - "runner": "h200-trt", + "multinode": False, "seq-len-configs": [ { "isl": 1024, "osl": 1024, "search-space": [ - {"tp": 8, "conc-start": 1, "conc-end": 4} + {"tp": 1, "conc-start": 4, "conc-end": 128}, + {"tp": 2, "conc-start": 4, "conc-end": 128}, + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 8} ] } ] @@ -84,1573 +77,784 @@ def sample_master_config(): @pytest.fixture -def sample_runner_config(): - """Sample runner config.""" - return { - "h200": ["h200-nv_1", "h200-nv_2"], - "h100": ["h100-aws_1"], - "h200-trt": ["h200-trt_1", "h200-trt_2", "h200-trt_3"] - } - - -@pytest.fixture -def temp_config_files(tmp_path, sample_master_config, sample_runner_config): - """Create temporary config files.""" - master_file = tmp_path / "master.yaml" - runner_file = tmp_path / "runners.yaml" - - with open(master_file, 'w') as f: - yaml.dump(sample_master_config, f) - - with open(runner_file, 'w') as f: - yaml.dump(sample_runner_config, f) - - return str(master_file), str(runner_file) - - -@pytest.fixture -def invalid_master_config(): - """Master config with validation errors.""" +def sample_multinode_config(): + """Sample master config with multinode entries.""" return { - "missing-field": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - # Missing precision, framework, runner, seq-len-configs - } - } - - -# Tests for seq_len_to_str -def test_seq_len_to_str_with_mapping(): - """Test seq_len_to_str with known mappings.""" - assert seq_len_to_str(1024, 1024) == "1k1k" - assert seq_len_to_str(1024, 8192) == "1k8k" - assert seq_len_to_str(8192, 1024) == "8k1k" - - -def test_seq_len_to_str_without_mapping(): - """Test seq_len_to_str fallback for unknown mappings.""" - assert seq_len_to_str(2048, 4096) == "2048_4096" - assert seq_len_to_str(512, 512) == "512_512" - - -# Tests for MatrixEntry validation -def test_matrix_entry_valid(): - """Test valid MatrixEntry.""" - entry = { - 
"image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - result = MatrixEntry(**entry) - assert result.image == "test:latest" - assert result.tp == 8 - - -def test_matrix_entry_missing_field(): - """Test MatrixEntry with missing required field.""" - entry = { - "image": "test:latest", - "model": "test/model", - # Missing other required fields - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_wrong_type(): - """Test MatrixEntry with wrong type.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": "not_an_int", # Wrong type - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -def test_matrix_entry_extra_field(): - """Test MatrixEntry with extra field (should be forbidden).""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp", - "extra-field": "should_fail" - } - with pytest.raises(Exception): # Pydantic ValidationError - MatrixEntry(**entry) - - -# Tests for validate_matrix_output -def test_validate_matrix_output_valid(): - """Test validate_matrix_output with valid entries.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - } - ] - result = 
validate_matrix_output(entries) - assert result == entries - - -def test_validate_matrix_output_invalid(): - """Test validate_matrix_output with invalid entry.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - # Missing required fields - } - ] - with pytest.raises(ValueError, match="Matrix entry at index 0 failed validation"): - validate_matrix_output(entries) - - -def test_validate_matrix_output_multiple_entries(): - """Test validate_matrix_output with multiple entries.""" - entries = [ - { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2048, - "exp-name": "test_exp" - }, - { - "image": "test2:latest", - "model": "test2/model", + "dsr1-fp4-gb200-dynamo-trt": { + "image": "nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "runner": "gb200", "precision": "fp4", - "framework": "trt", - "runner": "h100", - "isl": 1024, - "osl": 1024, - "tp": 4, - "ep": 2, - "dp-attn": True, - "conc": 8, - "max-model-len": 2048, - "exp-name": "test_exp2" - } - ] - result = validate_matrix_output(entries) - assert len(result) == 2 - - -# Tests for validate_master_configs_structure -def test_validate_master_configs_structure_valid(sample_master_config): - """Test validation of valid master config.""" - validate_master_configs_structure(sample_master_config) - - -def test_validate_master_configs_structure_missing_field(): - """Test validation with missing required field.""" - config = { - "test-key": { - "image": "test:latest", - "model-prefix": "test", - # Missing other required fields - } - } - with pytest.raises(ValueError, match="Missing required field"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_type(): - """Test validation with wrong field type.""" - config = { - 
"test-key": { - "image": 123, # Should be string - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be str"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_empty_seq_len_configs(): - """Test validation with empty seq-len-configs.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [] - } - } - with pytest.raises(ValueError, match="must be a non-empty list"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_invalid_search_space(): - """Test validation with invalid search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 8} # Missing conc-start and conc-end - ] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'conc-start'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_search_space(): - """Test validation with missing search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024 - # Missing search-space - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_search_space_not_list(): - """Test validation with search-space not being a list.""" - config = { - "test-key": { - "image": "test:latest", - "model": 
"test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": "not_a_list" - } - ] - } - } - with pytest.raises(ValueError, match="Missing or invalid 'search-space'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_extra_fields_in_search_space(): - """Test validation with extra fields in search-space.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", + "framework": "dynamo-trt", + "multinode": True, + "disagg": True, "seq-len-configs": [ { "isl": 1024, "osl": 1024, "search-space": [ { - "tp": 8, - "conc-start": 1, - "conc-end": 4, - "invalid-field": "value" + "spec-decoding": "mtp", + "conc-list": [1, 2, 4, 8, 16, 36], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + "additional-settings": ["PREFILL_MAX_NUM_TOKENS=4608"] + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + "additional-settings": ["DECODE_MAX_NUM_TOKENS=128"] + } + }, + { + "conc-list": [64, 128], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [] + }, + "decode": { + "num-worker": 1, + "tp": 16, + "ep": 16, + "dp-attn": True, + "additional-settings": [] + } } ] } ] } } - with pytest.raises(ValueError, match="Extra fields"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_isl(): - """Test validation with missing isl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 
'isl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_isl_type(): - """Test validation with wrong isl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": "not_int", - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'isl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_missing_osl(): - """Test validation with missing osl.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="Missing 'osl'"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_osl_type(): - """Test validation with wrong osl type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": "not_int", - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'osl' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_tp_type(): - """Test validation with wrong tp type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": "not_int", "conc-start": 1, 
"conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'tp' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_start_type(): - """Test validation with wrong conc-start type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": "not_int", "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-start' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_conc_end_type(): - """Test validation with wrong conc-end type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'conc-end' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_ep_type(): - """Test validation with wrong ep type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "ep": "not_int"}] - } - ] - } - } - with pytest.raises(ValueError, match="'ep' must be int"): - validate_master_configs_structure(config) - - -def test_validate_master_configs_structure_wrong_dp_attn_type(): - """Test validation with wrong dp-attn type.""" - config = { - "test-key": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - 
"precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 8, "conc-start": 1, "conc-end": 4, "dp-attn": "not_bool"}] - } - ] - } - } - with pytest.raises(ValueError, match="'dp-attn' must be bool"): - validate_master_configs_structure(config) - - -# Tests for load_config_files -def test_load_config_files_valid(temp_config_files): - """Test loading valid config files.""" - master_file, _ = temp_config_files - result = load_config_files([master_file]) - assert len(result) == 3 - assert "70b-fp8-vllm" in result - - -def test_load_config_files_multiple(tmp_path, sample_master_config): - """Test loading multiple config files.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"8b-fp4-trt": sample_master_config["8b-fp4-trt"]} - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - result = load_config_files([str(file1), str(file2)]) - assert len(result) == 2 - - -def test_load_config_files_not_found(): - """Test loading non-existent config file.""" - with pytest.raises(ValueError, match="does not exist"): - load_config_files(["/nonexistent/file.yaml"]) - - -def test_load_config_files_duplicate_keys(tmp_path, sample_master_config): - """Test loading files with duplicate keys.""" - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config1 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} - config2 = {"70b-fp8-vllm": sample_master_config["70b-fp8-vllm"]} # Duplicate - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - with pytest.raises(ValueError, match="Duplicate configuration keys"): - load_config_files([str(file1), str(file2)]) - - -# Tests for generate_full_sweep -def test_generate_full_sweep_basic(sample_master_config, 
temp_config_files): - """Test basic full sweep generation.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['exp-name'].startswith('70b_1k1k') for entry in result) - assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) - - -def test_generate_full_sweep_with_optionals(sample_master_config, temp_config_files): - """Test full sweep with optional ep and dp-attn.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # Find entry with tp=8 which should have ep=2 and dp-attn=True - tp8_entries = [e for e in result if e['tp'] == 8] - assert len(tp8_entries) > 0 - assert all(e['ep'] == 2 for e in tp8_entries) - assert all(e['dp-attn'] == True for e in tp8_entries) - - -def test_generate_full_sweep_no_matches(sample_master_config, temp_config_files): - """Test full sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["nonexistent"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_different_seq_len(sample_master_config, temp_config_files): - """Test full sweep with different sequence length.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k8k"] - step_size = 2 - precision = None 
- framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result) - - -def test_generate_full_sweep_step_size(sample_master_config, temp_config_files): - """Test full sweep with different step size.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 4 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # Should have entries at conc=4, 8, 16 (step_size=4, conc-start=4, conc-end=16) - conc_values = sorted(set(e['conc'] for e in result)) - assert 4 in conc_values - assert 16 in conc_values - - -def test_generate_full_sweep_seq_len_not_in_config(temp_config_files): - """Test full sweep when requested seq-len is not in config.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 8192, - "osl": 1024, # Only has 8k1k, not 1k1k - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 4} - ] - } - ] - } - } - - class Args: - model_prefix = ["test"] - seq_lens = ["1k1k"] # Requesting 1k1k but config only has 8k1k - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - # Should raise error since no matching seq-len - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), config) - - -def test_generate_full_sweep_concurrency_overshoot(temp_config_files): - """Test full sweep when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - 
"test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 5} # 1, 3*2=6 overshoots, clamps to 5 - ] - } - ] - } - } - class Args: - model_prefix = ["test"] - seq_lens = ["1k1k"] - step_size = 3 # Will overshoot: 1, 3, 9 (clamped to 5) - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 1, 3, 5 (5 is the clamped value) - assert conc_values == [1, 3, 5] - - -# Tests for generate_full_sweep with filters -def test_generate_full_sweep_no_filters(sample_master_config, temp_config_files): - """Test filtered sweep with no filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_full_sweep_with_filters_model_prefix(sample_master_config, temp_config_files): - """Test filtered sweep with model prefix filter.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert all("70b" in entry['exp-name'] for entry in result) - - -def test_generate_full_sweep_with_filters_multiple_filters(sample_master_config, temp_config_files): - """Test filtered sweep with multiple filters.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = ["fp8"] - 
framework = ["vllm"] - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - assert len(result) > 0 - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) - - -def test_generate_full_sweep_with_filters_test_mode(sample_master_config, temp_config_files): - """Test filtered sweep in test mode.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # In test mode, should only get one entry per seq-len (highest TP, lowest conc) - assert len(result) == 1 # Only one config matches 70b with 1k1k - assert result[0]['tp'] == 8 # Highest TP - assert '70b_1k1k' in result[0]['exp-name'] - - -def test_generate_full_sweep_with_filters_runner_type_validation(sample_master_config, temp_config_files): - """Test filtered sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["invalid-runner"] - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="Invalid runner type"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_runner_type_no_config(sample_master_config): - """Test filtered sweep with runner type but no config file.""" - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200"] - seq_lens = None - step_size = 2 - test_mode = False - runner_config = None - - with pytest.raises(ValueError, match="runner-config is required"): - generate_full_sweep(Args(), sample_master_config) - - -def 
test_generate_full_sweep_with_filters_multiple_runner_types(sample_master_config, temp_config_files): - """Test filtered sweep with multiple runner types.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = None - precision = None - framework = None - runner_type = ["h200", "h100"] - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200' in runners or 'h100' in runners - - -def test_generate_full_sweep_with_filters_no_matches(sample_master_config, temp_config_files): - """Test filtered sweep with no matching configs.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["nonexistent"] - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 2 - test_mode = False - runner_config = runner_file - - with pytest.raises(ValueError, match="No configs found matching filters"): - generate_full_sweep(Args(), sample_master_config) - - -def test_generate_full_sweep_with_filters_concurrency_overshoot(temp_config_files): - """Test filtered sweep when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 2, "conc-end": 7} # 2, 8 overshoots, clamps to 7 - ] - } - ] - } - } - class Args: - model_prefix = None - precision = None - framework = None - runner_type = None - seq_lens = None - step_size = 4 # Will overshoot: 2, 8 (clamped to 7) - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - # Should have 2, 7 (7 is the clamped value) - assert 2 in 
conc_values - assert 7 in conc_values - - -# Tests for generate_test_config -def test_generate_test_config_basic(sample_master_config, temp_config_files): - """Test basic test config generation.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = generate_test_config(Args(), sample_master_config) - assert len(result) > 0 - - -def test_generate_test_config_test_mode(sample_master_config, temp_config_files): - """Test test config in test mode.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = ["1k1k"] - step_size = 2 - test_mode = True - - result = generate_test_config(Args(), sample_master_config) - # In test mode, should only use lowest concurrency - assert all(entry['conc'] == 1 or entry['conc'] == 2 for entry in result) - - -def test_generate_test_config_specific_runner_node(sample_master_config, temp_config_files): - """Test test config with specific runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 2 - test_mode = False - - result = generate_test_config(Args(), sample_master_config) - assert all(entry['runner'] == 'h200-nv_1' for entry in result) - - -def test_generate_test_config_invalid_key(sample_master_config, temp_config_files): - """Test test config with invalid key.""" - _, runner_file = temp_config_files - - class Args: - key = "nonexistent-key" - runner_config = runner_file - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist in config files"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_invalid_runner_node(sample_master_config, temp_config_files): - 
"""Test test config with invalid runner node.""" - _, runner_file = temp_config_files - - class Args: - key = "70b-fp8-vllm" - runner_config = runner_file - runner_node = "invalid-node" - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="is not compatible"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_missing_runner_config(sample_master_config): - """Test test config with missing runner config file.""" - class Args: - key = "70b-fp8-vllm" - runner_config = "/nonexistent/file.yaml" - runner_node = None - seq_lens = None - step_size = 2 - test_mode = False - - with pytest.raises(ValueError, match="does not exist"): - generate_test_config(Args(), sample_master_config) - - -def test_generate_test_config_concurrency_overshoot(temp_config_files): - """Test test config when concurrency step overshoots end value.""" - _, runner_file = temp_config_files - - config = { - "test-fp8-vllm": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 6} - ] - } - ] - } +@pytest.fixture +def sample_runner_config(): + """Sample runner config.""" + return { + "h200": ["h200-nv_1", "h200-nv_2"], + "b200": ["b200-nv_1"], + "gb200": ["gb200-nv_1", "gb200-nv_2", "gb200-nv_3"], + "h100": ["h100-aws_1"] } - class Args: - key = "test-fp8-vllm" - runner_config = runner_file - runner_node = "h200-nv_1" - seq_lens = None - step_size = 4 # Will overshoot: 1, 4, 16 (clamped to 6) - test_mode = False - - result = generate_test_config(Args(), config) - conc_values = sorted(set(e['conc'] for e in result)) - assert 1 in conc_values - assert 4 in conc_values - assert 6 in conc_values - - -# Tests for generate_runner_model_sweep_config -def test_generate_runner_model_sweep_config(sample_master_config, 
temp_config_files): - """Test runner-model sweep config generation.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None - result = generate_runner_model_sweep_config(Args(), sample_master_config) - assert len(result) > 0 - # Should have entries for each runner node under h200 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - -def test_generate_runner_model_sweep_config_invalid_runner(sample_master_config, temp_config_files): - """Test runner-model sweep with invalid runner type.""" - _, runner_file = temp_config_files - - class Args: - runner_type = "invalid-runner" - runner_config = runner_file - runner_node_filter = None - - with pytest.raises(ValueError, match="does not exist in runner config"): - generate_runner_model_sweep_config(Args(), sample_master_config) +@pytest.fixture +def temp_config_files(tmp_path, sample_single_node_config, sample_runner_config): + """Create temporary config files for single-node tests.""" + master_file = tmp_path / "master.yaml" + runner_file = tmp_path / "runners.yaml" + with open(master_file, 'w') as f: + yaml.dump(sample_single_node_config, f) -def test_generate_runner_model_sweep_config_with_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter.""" - _, runner_file = temp_config_files + with open(runner_file, 'w') as f: + yaml.dump(sample_runner_config, f) - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv_1" + return str(master_file), str(runner_file) - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should only have entries for h200-nv_1 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' not in runners +@pytest.fixture +def temp_multinode_config_files(tmp_path, sample_multinode_config, 
sample_runner_config): + """Create temporary config files for multinode tests.""" + master_file = tmp_path / "master.yaml" + runner_file = tmp_path / "runners.yaml" -def test_generate_runner_model_sweep_config_with_node_filter_multiple_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter matching multiple nodes.""" - _, runner_file = temp_config_files + with open(master_file, 'w') as f: + yaml.dump(sample_multinode_config, f) - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nv" # Should match both nv_1 and nv_2 + with open(runner_file, 'w') as f: + yaml.dump(sample_runner_config, f) - result = generate_runner_model_sweep_config(Args(), sample_master_config) - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners + return str(master_file), str(runner_file) -def test_generate_runner_model_sweep_config_with_node_filter_no_matches(sample_master_config, temp_config_files): - """Test runner-model sweep with runner node filter that matches no nodes.""" - _, runner_file = temp_config_files +# ============================================================================ +# Helper class for mocking args +# ============================================================================ + +class MockArgs: + """Mock args object for testing functions.""" + def __init__(self, **kwargs): + # Defaults + self.model_prefix = None + self.precision = None + self.framework = None + self.runner_type = None + self.seq_lens = None + self.step_size = 2 + self.max_conc = None + self.max_tp = None + self.max_ep = None + self.single_node = False + self.multi_node = False + self.runner_config = None + self.runner_node_filter = None + + # Override with provided kwargs + for key, value in kwargs.items(): + setattr(self, key, value) + + +# ============================================================================ +# Tests for seq_len_to_str +# 
============================================================================ - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = "nonexistent" +class TestSeqLenToStr: + """Tests for the seq_len_to_str function.""" - with pytest.raises(ValueError, match="No runner nodes found matching filter"): - generate_runner_model_sweep_config(Args(), sample_master_config) + def test_known_mapping_1k1k(self): + assert seq_len_to_str(1024, 1024) == "1k1k" + def test_known_mapping_1k8k(self): + assert seq_len_to_str(1024, 8192) == "1k8k" -def test_generate_runner_model_sweep_config_without_node_filter(sample_master_config, temp_config_files): - """Test runner-model sweep without runner node filter (default behavior).""" - _, runner_file = temp_config_files + def test_known_mapping_8k1k(self): + assert seq_len_to_str(8192, 1024) == "8k1k" - class Args: - runner_type = "h200" - runner_config = runner_file - runner_node_filter = None + def test_unknown_mapping_fallback(self): + assert seq_len_to_str(2048, 4096) == "2048_4096" - result = generate_runner_model_sweep_config(Args(), sample_master_config) - # Should have entries for all h200 nodes - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners + def test_unknown_mapping_small_values(self): + assert seq_len_to_str(512, 512) == "512_512" -# Tests for generate_runner_sweep_config -def test_generate_runner_sweep_config(sample_master_config, temp_config_files): - """Test runner sweep config generation.""" - _, runner_file = temp_config_files +# ============================================================================ +# Tests for load_config_files +# ============================================================================ - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file +class TestLoadConfigFiles: + """Tests for the load_config_files function.""" - 
result = generate_runner_sweep_config(Args(), sample_master_config) - assert len(result) > 0 + def test_load_single_valid_file(self, temp_config_files): + master_file, _ = temp_config_files + result = load_config_files([master_file]) + assert len(result) == 2 + assert "dsr1-fp8-h200-sglang" in result + assert "gptoss-fp4-b200-vllm" in result + def test_load_multiple_files(self, tmp_path, sample_single_node_config): + file1 = tmp_path / "config1.yaml" + file2 = tmp_path / "config2.yaml" -def test_generate_runner_sweep_config_with_filters(sample_master_config, temp_config_files): - """Test runner sweep with precision and framework filters.""" - _, runner_file = temp_config_files + config1 = {"dsr1-fp8-h200-sglang": sample_single_node_config["dsr1-fp8-h200-sglang"]} + config2 = {"gptoss-fp4-b200-vllm": sample_single_node_config["gptoss-fp4-b200-vllm"]} - class Args: - model_prefix = "70b" - runner_type = "h200" - precision = "fp8" - framework = "vllm" - runner_config = runner_file + with open(file1, 'w') as f: + yaml.dump(config1, f) + with open(file2, 'w') as f: + yaml.dump(config2, f) - result = generate_runner_sweep_config(Args(), sample_master_config) - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'vllm' for entry in result) + result = load_config_files([str(file1), str(file2)]) + assert len(result) == 2 + def test_load_nonexistent_file(self): + with pytest.raises(ValueError, match="does not exist"): + load_config_files(["/nonexistent/file.yaml"]) -def test_generate_runner_sweep_config_no_matches(sample_master_config, temp_config_files): - """Test runner sweep with no matching configs.""" - _, runner_file = temp_config_files + def test_load_files_with_duplicate_keys(self, tmp_path, sample_single_node_config): + file1 = tmp_path / "config1.yaml" + file2 = tmp_path / "config2.yaml" - class Args: - model_prefix = "nonexistent" - runner_type = "h200" - precision = None - framework = None - runner_config = runner_file + 
config = {"dsr1-fp8-h200-sglang": sample_single_node_config["dsr1-fp8-h200-sglang"]} - with pytest.raises(ValueError, match="No configs found matching"): - generate_runner_sweep_config(Args(), sample_master_config) + with open(file1, 'w') as f: + yaml.dump(config, f) + with open(file2, 'w') as f: + yaml.dump(config, f) + with pytest.raises(ValueError, match="Duplicate configuration keys"): + load_config_files([str(file1), str(file2)]) -# Tests for generate_custom_test -def test_generate_custom_test(temp_config_files): - """Test custom test generation.""" - _, runner_file = temp_config_files - class Args: - runner_label = "h200" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file +# ============================================================================ +# Tests for load_runner_file +# ============================================================================ - result = generate_custom_test(Args()) - assert len(result) == 1 - assert result[0]['image'] == "vllm/vllm-openai:latest" - assert result[0]['exp-name'] == "custom_test" +class TestLoadRunnerFile: + """Tests for the load_runner_file function.""" + def test_load_valid_runner_file(self, temp_config_files): + _, runner_file = temp_config_files + result = load_runner_file(runner_file) + assert "h200" in result + assert "b200" in result -def test_generate_custom_test_invalid_runner(temp_config_files): - """Test custom test with invalid runner label.""" - _, runner_file = temp_config_files + def test_load_nonexistent_runner_file(self): + with pytest.raises(ValueError, match="does not exist"): + load_runner_file("/nonexistent/runners.yaml") - class Args: - runner_label = "invalid-runner" - image = "vllm/vllm-openai:latest" - model = "test/model" - framework = "vllm" - precision = "fp8" - exp_name = "custom_test" - runner_config = runner_file - with pytest.raises(ValueError, match="Unable to find specified 
runner label"): - generate_custom_test(Args()) +# ============================================================================ +# Tests for generate_full_sweep - Single Node +# ============================================================================ +class TestGenerateFullSweepSingleNode: + """Tests for generate_full_sweep with single-node configurations.""" -# Tests for main function -def test_main_full_sweep(temp_config_files): - """Test main function with full-sweep command.""" - master_file, _ = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b", - "--step-size", "2" - ] - - with patch('sys.argv', test_args): - result = main() + def test_basic_sweep(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) assert len(result) > 0 + assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) - -def test_main_full_sweep_with_filters(temp_config_files): - """Test main function with full-sweep command with filters.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--model-prefix", "70b", - "--precision", "fp8", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() + def test_sweep_no_filters(self, sample_single_node_config, sample_runner_config): + args = MockArgs(single_node=True) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) assert len(result) > 0 - -def test_main_test_config(temp_config_files): - """Test main function with test-config command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "test-config", - 
"--config-files", master_file, - "--runner-config", runner_file, - "--key", "70b-fp8-vllm", - "--runner-node", "h200-nv_1", - "--test-mode" - ] - - with patch('sys.argv', test_args): - result = main() + def test_sweep_returns_empty_when_no_matches(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["nonexistent"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert result == [] + + def test_filter_by_precision(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + precision=["fp8"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(entry['precision'] == 'fp8' for entry in result) + + def test_filter_by_framework(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + framework=["vllm"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(entry['framework'] == 'vllm' for entry in result) + + def test_filter_by_runner_type(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + runner_type=["h200"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(entry['runner'] == 'h200' for entry in result) + + def test_invalid_runner_type_raises_error(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + runner_type=["invalid-runner"], + single_node=True + ) + with pytest.raises(ValueError, match="Invalid runner type"): + generate_full_sweep(args, sample_single_node_config, sample_runner_config) + + def test_multiple_runner_types(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + runner_type=["h200", "b200"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + runners = set(entry['runner'] for 
entry in result) + assert 'h200' in runners or 'b200' in runners + + def test_filter_by_seq_lens(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k8k"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result) + + def test_filter_multiple_seq_lens(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k", "8k1k"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + seq_lens = set((e['isl'], e['osl']) for e in result) + assert (1024, 1024) in seq_lens + assert (8192, 1024) in seq_lens + + def test_step_size(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["gptoss"], + seq_lens=["1k1k"], + step_size=4, + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + # With step_size=4, starting from 4: 4, 16, 64, 128 (or clamped) + conc_values = set(e['conc'] for e in result) + assert 4 in conc_values + + def test_max_conc_filter(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k"], + max_conc=16, + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(entry['conc'] <= 16 for entry in result) + + def test_max_tp_filter(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k"], + max_tp=4, + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(entry['tp'] <= 4 for entry in result) + + def test_max_ep_filter(self, sample_single_node_config, sample_runner_config): + args = MockArgs( + model_prefix=["dsr1"], + 
seq_lens=["1k1k"], + max_ep=1, + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + # Should exclude entries with ep > 1 + assert all(entry['ep'] <= 1 for entry in result) + + def test_concurrency_overshoot_clamped(self, sample_runner_config): + """Test that concurrency values are clamped to conc-end.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 1, "conc-end": 5} + ] + } + ] + } + } + args = MockArgs(step_size=3, single_node=True) + result = generate_full_sweep(args, config, sample_runner_config) + conc_values = sorted(set(e['conc'] for e in result)) + # 1, 3, 9 -> clamped to 5 + assert conc_values == [1, 3, 5] + + def test_default_ep_dp_attn_values(self, sample_single_node_config, sample_runner_config): + """Test that entries without ep/dp-attn get default values.""" + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k"], + max_tp=4, # Filter to tp=4 which doesn't have ep/dp-attn + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + # tp=4 entries should have default ep=1 and dp-attn=False + for entry in result: + if entry['tp'] == 4: + assert entry['ep'] == 1 + assert entry['dp-attn'] == False + + def test_explicit_ep_dp_attn_values(self, sample_single_node_config, sample_runner_config): + """Test that entries with explicit ep/dp-attn use those values.""" + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + # tp=8 entries should have ep=2 and dp-attn=True + tp8_entries = [e for e in result if e['tp'] == 8] + assert all(e['ep'] == 2 for e in tp8_entries) + assert 
all(e['dp-attn'] == True for e in tp8_entries) + + def test_max_model_len_calculation(self, sample_single_node_config, sample_runner_config): + """Test that max-model-len is calculated as isl + osl + 200.""" + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k8k"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + # isl=1024, osl=8192 -> max-model-len = 1024 + 8192 + 200 = 9416 + assert all(e['max-model-len'] == 9416 for e in result) + + def test_exp_name_format(self, sample_single_node_config, sample_runner_config): + """Test that exp-name follows the expected format.""" + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(e['exp-name'] == 'dsr1_1k1k' for e in result) + + def test_disagg_defaults_to_false(self, sample_single_node_config, sample_runner_config): + """Test that disagg defaults to False when not specified.""" + args = MockArgs(single_node=True) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(e['disagg'] == False for e in result) + + def test_skips_multinode_configs_in_single_node_mode(self, sample_multinode_config, sample_runner_config): + """Test that multinode configs are skipped when --single-node is specified.""" + args = MockArgs(single_node=True) + result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) + assert result == [] + + +# ============================================================================ +# Tests for generate_full_sweep - Multi Node +# ============================================================================ + +class TestGenerateFullSweepMultiNode: + """Tests for generate_full_sweep with multinode configurations.""" + + def test_basic_multinode_sweep(self, sample_multinode_config, sample_runner_config): + args = MockArgs(multi_node=True) + result = 
generate_full_sweep(args, sample_multinode_config, sample_runner_config) assert len(result) > 0 + def test_multinode_conc_is_list(self, sample_multinode_config, sample_runner_config): + """Test that multinode entries have conc as a list.""" + args = MockArgs(multi_node=True) + result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) + for entry in result: + assert isinstance(entry['conc'], list) + + def test_multinode_has_prefill_decode(self, sample_multinode_config, sample_runner_config): + """Test that multinode entries have prefill and decode configs.""" + args = MockArgs(multi_node=True) + result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) + for entry in result: + assert 'prefill' in entry + assert 'decode' in entry + + def test_multinode_spec_decoding_defaults_to_none(self, sample_multinode_config, sample_runner_config): + """Test that spec-decoding defaults to 'none' if not specified.""" + args = MockArgs(multi_node=True) + result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) + # The second search-space entry doesn't specify spec-decoding + for entry in result: + assert entry['spec-decoding'] in ['mtp', 'none'] + + def test_multinode_disagg_value(self, sample_multinode_config, sample_runner_config): + """Test that disagg is properly passed through.""" + args = MockArgs(multi_node=True) + result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) + # The sample config has disagg=True + assert all(e['disagg'] == True for e in result) + + def test_multinode_max_conc_filter(self, sample_multinode_config, sample_runner_config): + """Test max_conc filter works with multinode conc lists.""" + args = MockArgs(multi_node=True, max_conc=8) + result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) + for entry in result: + assert all(c <= 8 for c in entry['conc']) + + def test_multinode_max_conc_filters_out_empty(self, 
sample_multinode_config, sample_runner_config): + """Test that entries with no valid conc values are filtered out.""" + args = MockArgs(multi_node=True, max_conc=0) + result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) + assert result == [] + + def test_skips_single_node_configs_in_multi_node_mode(self, sample_single_node_config, sample_runner_config): + """Test that single-node configs are skipped when --multi-node is specified.""" + args = MockArgs(multi_node=True) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert result == [] + + +# ============================================================================ +# Tests for generate_runner_model_sweep_config +# ============================================================================ -def test_main_runner_model_sweep(temp_config_files): - """Test main function with runner-model-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200" - ] +class TestGenerateRunnerModelSweepConfig: + """Tests for the generate_runner_model_sweep_config function.""" - with patch('sys.argv', test_args): - result = main() + def test_basic_runner_model_sweep(self, sample_single_node_config, sample_runner_config): + args = MockArgs(runner_type="h200") + result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) assert len(result) > 0 + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' in runners + def test_invalid_runner_type(self, sample_single_node_config, sample_runner_config): + args = MockArgs(runner_type="invalid-runner") + with pytest.raises(ValueError, match="does not exist in runner config"): + generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) -def 
test_main_runner_model_sweep_with_node_filter(temp_config_files): - """Test main function with runner-model-sweep command with node filter.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--runner-node-filter", "nv_1" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 + def test_runner_node_filter(self, sample_single_node_config, sample_runner_config): + args = MockArgs(runner_type="h200", runner_node_filter="nv_1") + result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) runners = set(entry['runner'] for entry in result) assert 'h200-nv_1' in runners assert 'h200-nv_2' not in runners - -def test_main_runner_sweep(temp_config_files): - """Test main function with runner-sweep command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - -def test_main_custom(temp_config_files): - """Test main function with custom command.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "custom", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-label", "h200", - "--image", "test:latest", - "--model", "test/model", - "--framework", "vllm", - "--precision", "fp8", - "--exp-name", "custom_test" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) == 1 - - -def test_main_invalid_config_structure(tmp_path): - """Test main with invalid config structure.""" - invalid_file = tmp_path / "invalid.yaml" - with open(invalid_file, 'w') as f: - yaml.dump({"key": {"image": 
"test"}}, f) # Missing required fields - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", str(invalid_file), - "--seq-lens", "1k1k", - "--model-prefix", "test" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError): - main() - - -def test_main_validation_failure(temp_config_files, monkeypatch): - """Test main with validation failure on output.""" - master_file, _ = temp_config_files - - # Monkey patch validate_matrix_output to always fail - def mock_validate(entries): - raise ValueError("Validation failed") - - monkeypatch.setattr('generate_sweep_configs.validate_matrix_output', mock_validate) - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--seq-lens", "1k1k", - "--model-prefix", "70b" - ] - - with patch('sys.argv', test_args): - with pytest.raises(ValueError, match="Validation failed"): - main() - - + def test_runner_node_filter_multiple_matches(self, sample_single_node_config, sample_runner_config): + args = MockArgs(runner_type="h200", runner_node_filter="nv") + result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' in runners + + def test_runner_node_filter_no_matches(self, sample_single_node_config, sample_runner_config): + args = MockArgs(runner_type="h200", runner_node_filter="nonexistent") + with pytest.raises(ValueError, match="No runner nodes found matching filter"): + generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) + + def test_uses_highest_tp_lowest_conc(self, sample_single_node_config, sample_runner_config): + """Test that it uses highest TP with lowest concurrency.""" + args = MockArgs(runner_type="h200") + result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) + # dsr1 config has tp=4 (conc 4-64) and tp=8 (conc 
4-64), should pick tp=8, conc=4 + for entry in result: + assert entry['tp'] == 8 + assert entry['conc'] == 4 + + def test_always_uses_1k1k(self, sample_single_node_config, sample_runner_config): + """Test that it always uses 1k1k sequence lengths.""" + args = MockArgs(runner_type="h200") + result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) + assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) + + def test_exp_name_has_test_suffix(self, sample_single_node_config, sample_runner_config): + """Test that exp-name has _test suffix.""" + args = MockArgs(runner_type="h200") + result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) + assert all('_test' in entry['exp-name'] for entry in result) + + +# ============================================================================ +# Tests for main function +# ============================================================================ + +class TestMain: + """Tests for the main function with CLI argument parsing.""" + + def test_main_full_sweep_single_node(self, temp_config_files): + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--seq-lens", "1k1k", + "--model-prefix", "dsr1", + "--step-size", "2", + "--single-node" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + def test_main_full_sweep_multi_node(self, temp_multinode_config_files): + master_file, runner_file = temp_multinode_config_files + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--multi-node" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + def test_main_full_sweep_with_filters(self, temp_config_files): + master_file, runner_file = temp_config_files + + 
test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--model-prefix", "dsr1", + "--precision", "fp8", + "--framework", "sglang", + "--single-node" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + assert all(entry['precision'] == 'fp8' for entry in result) + assert all(entry['framework'] == 'sglang' for entry in result) + + def test_main_full_sweep_empty_result(self, temp_config_files): + """Test that empty results are returned without error.""" + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "full-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--model-prefix", "nonexistent", + "--single-node" + ] + + with patch('sys.argv', test_args): + result = main() + assert result == [] + + def test_main_runner_model_sweep(self, temp_config_files): + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "runner-model-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--runner-type", "h200" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + + def test_main_runner_model_sweep_with_filter(self, temp_config_files): + master_file, runner_file = temp_config_files + + test_args = [ + "generate_sweep_configs.py", + "runner-model-sweep", + "--config-files", master_file, + "--runner-config", runner_file, + "--runner-type", "h200", + "--runner-node-filter", "nv_1" + ] + + with patch('sys.argv', test_args): + result = main() + assert len(result) > 0 + runners = set(entry['runner'] for entry in result) + assert 'h200-nv_1' in runners + assert 'h200-nv_2' not in runners + + +# ============================================================================ # Edge case tests -def test_concurrency_step_reaches_exact_end(sample_master_config, temp_config_files): - """Test that concurrency stepping 
reaches exact end value.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # conc-start=4, conc-end=16, step=2 should give 4,8,16 - conc_values = sorted(set(e['conc'] for e in result)) - assert 16 in conc_values - - -def test_multiple_model_prefixes_filtered_sweep(sample_master_config, temp_config_files): - """Test filtered sweep with multiple model prefixes.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b", "8b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - exp_names = [e['exp-name'] for e in result] - assert any('70b' in name for name in exp_names) - assert any('8b' in name for name in exp_names) - - -def test_seq_len_filter_multiple(sample_master_config, temp_config_files): - """Test filtering with multiple sequence lengths.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - precision = None - framework = None - runner_type = None - seq_lens = ["1k1k", "1k8k"] - step_size = 2 - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - seq_lens = set((e['isl'], e['osl']) for e in result) - assert (1024, 1024) in seq_lens - assert (1024, 8192) in seq_lens - - -def test_default_ep_dp_attn_values(sample_master_config, temp_config_files): - """Test that default ep and dp-attn values are set correctly.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["8b"] - seq_lens = ["1k1k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = 
generate_full_sweep(Args(), sample_master_config) - # 8b config doesn't specify ep/dp-attn, so should use defaults - assert all(e['ep'] == 1 for e in result) - assert all(e['dp-attn'] == False for e in result) - - -def test_max_model_len_calculation(sample_master_config, temp_config_files): - """Test that max-model-len is calculated correctly.""" - _, runner_file = temp_config_files - - class Args: - model_prefix = ["70b"] - seq_lens = ["1k8k"] - step_size = 2 - precision = None - framework = None - runner_type = None - test_mode = False - runner_config = runner_file - - result = generate_full_sweep(Args(), sample_master_config) - # isl=1024, osl=8192, so max-model-len should be 1024+8192+200=9416 - assert all(e['max-model-len'] == 9416 for e in result) +# ============================================================================ + +class TestEdgeCases: + """Edge case tests.""" + + def test_concurrency_range_equals_start_end(self, sample_runner_config): + """Test when conc-start equals conc-end.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 8, "conc-end": 8} + ] + } + ] + } + } + args = MockArgs(single_node=True) + result = generate_full_sweep(args, config, sample_runner_config) + assert len(result) == 1 + assert result[0]['conc'] == 8 + + def test_multiple_model_prefixes(self, sample_single_node_config, sample_runner_config): + """Test filtering with multiple model prefixes.""" + args = MockArgs( + model_prefix=["dsr1", "gptoss"], + seq_lens=["1k1k"], + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + exp_names = [e['exp-name'] for e in result] + assert any('dsr1' in name for name in exp_names) + assert any('gptoss' in name for name in exp_names) + + 
def test_combined_max_filters(self, sample_single_node_config, sample_runner_config): + """Test combining max_tp, max_ep, and max_conc filters.""" + args = MockArgs( + model_prefix=["dsr1"], + seq_lens=["1k1k"], + max_tp=4, + max_conc=8, + single_node=True + ) + result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert all(entry['tp'] <= 4 for entry in result) + assert all(entry['conc'] <= 8 for entry in result) + + def test_multinode_conc_range_instead_of_list(self, sample_runner_config): + """Test multinode config with conc-start/conc-end instead of conc-list.""" + config = { + "multinode-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "runner": "gb200", + "precision": "fp8", + "framework": "dynamo-trt", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-start": 4, + "conc-end": 16, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False + } + } + ] + } + ] + } + } + args = MockArgs(multi_node=True, step_size=2) + result = generate_full_sweep(args, config, sample_runner_config) + assert len(result) == 1 + # conc should be [4, 8, 16] + assert result[0]['conc'] == [4, 8, 16] if __name__ == "__main__": - pytest.main([__file__, "-v", "--cov=generate_sweep_configs", "--cov-report=term-missing"]) + pytest.main([__file__, "-v"]) diff --git a/utils/matrix-logic/test_validation.py b/utils/matrix-logic/test_validation.py new file mode 100644 index 000000000..6a9dc9683 --- /dev/null +++ b/utils/matrix-logic/test_validation.py @@ -0,0 +1,1282 @@ +import pytest +from validation import ( + validate_master_config, + validate_matrix_output, + validate_runner_config, + Fields, + SingleNodeMatrixEntry, + MultiNodeMatrixEntry, + WorkerConfig, + SingleNodeSearchSpaceEntry, + MultiNodeSearchSpaceEntry, + SingleNodeSeqLenConfig, + 
MultiNodeSeqLenConfig, + SingleNodeMasterConfigEntry, + MultiNodeMasterConfigEntry, +) + + +# ============================================================================ +# Tests for validate_master_config - Single Node +# ============================================================================ + +class TestValidateMasterConfigSingleNode: + """Tests for validate_master_config with single-node configurations.""" + + def test_valid_single_node_config(self): + """Test validation of a valid single-node config.""" + config = { + "test-fp8-h200-vllm": { + "image": "vllm/vllm-openai:v0.11.0", + "model": "meta-llama/Llama-3-70b", + "model-prefix": "llama70b", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 4, "conc-end": 64}, + {"tp": 8, "conc-start": 4, "conc-end": 64, "ep": 2, "dp-attn": True} + ] + } + ] + } + } + result = validate_master_config(config) + assert result == config + + def test_valid_single_node_with_disagg(self): + """Test validation with disagg field.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + } + result = validate_master_config(config) + assert result == config + + def test_valid_single_node_with_conc_list(self): + """Test validation with conc-list instead of conc-start/end.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-list": [1, 2, 4, 8, 16]} + ] + } + ] + 
} + } + result = validate_master_config(config) + assert result == config + + def test_missing_required_field_image(self): + """Test validation fails when image is missing.""" + config = { + "test-config": { + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_required_field_model(self): + """Test validation fails when model is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_required_field_model_prefix(self): + """Test validation fails when model-prefix is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_required_field_precision(self): + """Test validation fails when precision is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] 
+ } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_required_field_framework(self): + """Test validation fails when framework is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_required_field_runner(self): + """Test validation fails when runner is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_required_field_multinode(self): + """Test validation fails when multinode is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_required_field_seq_len_configs(self): + """Test validation fails when seq-len-configs is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False + } + } + with 
pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_wrong_type_image(self): + """Test validation fails when image has wrong type.""" + config = { + "test-config": { + "image": 123, + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_empty_seq_len_configs(self): + """Test that empty seq-len-configs is allowed by validation. + + Note: Pydantic allows empty lists by default. This may produce + no output at runtime but is not a validation error. + """ + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [] + } + } + # This is allowed - Pydantic doesn't enforce non-empty lists by default + result = validate_master_config(config) + assert result == config + + def test_missing_isl_in_seq_len_config(self): + """Test validation fails when isl is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_osl_in_seq_len_config(self): + """Test validation fails when osl is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + 
"multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_search_space(self): + """Test validation fails when search-space is missing.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024 + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_empty_search_space(self): + """Test that empty search-space is allowed by validation. + + Note: Pydantic allows empty lists by default. This may produce + no output at runtime but is not a validation error. + """ + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [] + } + ] + } + } + # This is allowed - Pydantic doesn't enforce non-empty lists by default + result = validate_master_config(config) + assert result == config + + def test_missing_tp_in_search_space(self): + """Test validation fails when tp is missing in search-space.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_missing_both_conc_range_and_list(self): + """Test validation fails when neither conc-start/end nor 
conc-list is provided.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_both_conc_range_and_list_provided(self): + """Test validation fails when both conc-start/end and conc-list are provided.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 4, "conc-end": 64, "conc-list": [1, 2, 4]} + ] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_conc_start_greater_than_end(self): + """Test validation fails when conc-start > conc-end.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 64, "conc-end": 4}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_conc_list_with_zero_value(self): + """Test validation fails when conc-list contains zero.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-list": [0, 1, 2]}] + } + ] + } + } + with 
pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_wrong_type_tp(self): + """Test validation fails when tp has wrong type.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": "four", "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_wrong_type_ep(self): + """Test validation fails when ep has wrong type.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "ep": "two", "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_wrong_type_dp_attn(self): + """Test validation fails when dp-attn has a truly invalid type. + + Note: Pydantic coerces some string values to bools (e.g., "yes" -> True). + We test with a value that cannot be coerced. 
+ """ + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "dp-attn": [1, 2, 3], "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_extra_field_in_top_level(self): + """Test validation fails when extra field is present at top level.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "extra-field": "not-allowed", + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_extra_field_in_search_space(self): + """Test validation fails when extra field is present in search-space.""" + config = { + "test-config": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 4, "conc-start": 4, "conc-end": 64, "invalid-field": "value"} + ] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + +# ============================================================================ +# Tests for validate_master_config - Multi Node +# ============================================================================ + +class TestValidateMasterConfigMultiNode: + """Tests for validate_master_config with multinode configurations.""" + + def test_valid_multinode_config(self): + """Test 
validation of a valid multinode config.""" + config = { + "test-multinode": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "spec-decoding": "mtp", + "conc-list": [1, 2, 4, 8], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + "additional-settings": ["PREFILL_MAX_NUM_TOKENS=4608"] + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + "additional-settings": ["DECODE_MAX_NUM_TOKENS=128"] + } + } + ] + } + ] + } + } + result = validate_master_config(config) + assert result == config + + def test_valid_multinode_with_conc_range(self): + """Test validation of multinode config with conc-start/conc-end.""" + config = { + "test-multinode": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-start": 4, + "conc-end": 64, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + } + } + ] + } + ] + } + } + result = validate_master_config(config) + assert result == config + + def test_multinode_missing_prefill(self): + """Test validation fails when prefill is missing in multinode config.""" + config = { + "test-multinode": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [1, 2, 4], + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + } + } + ] + } + ] + } + } + 
with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_multinode_missing_decode(self): + """Test validation fails when decode is missing in multinode config.""" + config = { + "test-multinode": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [1, 2, 4], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + } + } + ] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_multinode_invalid_spec_decoding(self): + """Test validation fails when spec-decoding has invalid value.""" + config = { + "test-multinode": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "spec-decoding": "invalid-value", + "conc-list": [1, 2, 4], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + } + } + ] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_multinode_worker_config_missing_num_worker(self): + """Test validation fails when num-worker is missing in worker config.""" + config = { + "test-multinode": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [1, 2, 4], + "prefill": { + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + 
"num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + } + } + ] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + def test_multinode_worker_config_missing_tp(self): + """Test validation fails when tp is missing in worker config.""" + config = { + "test-multinode": { + "image": "test:latest", + "model": "test/model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [1, 2, 4], + "prefill": { + "num-worker": 1, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + } + } + ] + } + ] + } + } + with pytest.raises(ValueError, match="failed validation"): + validate_master_config(config) + + +# ============================================================================ +# Tests for validate_matrix_output - Single Node +# ============================================================================ + +class TestValidateMatrixOutputSingleNode: + """Tests for validate_matrix_output with single-node entries.""" + + def test_valid_single_node_entry(self): + """Test validation of a valid single-node matrix entry.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2248, + "exp-name": "test_1k1k", + "disagg": False + } + result = validate_matrix_output(entry, is_multinode=False) + assert result == entry + + def test_single_node_missing_field(self): + """Test validation fails when required field is missing.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + # Missing tp + "ep": 1, + "dp-attn": False, + 
"conc": 4, + "max-model-len": 2248, + "exp-name": "test_1k1k", + "disagg": False + } + with pytest.raises(ValueError, match="failed validation"): + validate_matrix_output(entry, is_multinode=False) + + def test_single_node_wrong_type(self): + """Test validation fails when field has wrong type.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": "not-an-int", + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2248, + "exp-name": "test_1k1k", + "disagg": False + } + with pytest.raises(ValueError, match="failed validation"): + validate_matrix_output(entry, is_multinode=False) + + def test_single_node_extra_field(self): + """Test validation fails when extra field is present.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp8", + "framework": "vllm", + "runner": "h200", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2248, + "exp-name": "test_1k1k", + "disagg": False, + "extra-field": "not-allowed" + } + with pytest.raises(ValueError, match="failed validation"): + validate_matrix_output(entry, is_multinode=False) + + +# ============================================================================ +# Tests for validate_matrix_output - Multi Node +# ============================================================================ + +class TestValidateMatrixOutputMultiNode: + """Tests for validate_matrix_output with multinode entries.""" + + def test_valid_multinode_entry(self): + """Test validation of a valid multinode matrix entry.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp4", + "framework": "dynamo-trt", + "spec-decoding": "mtp", + "runner": "gb200", + "isl": 1024, + "osl": 1024, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + 
"dp-attn": False + }, + "conc": [1, 2, 4, 8], + "max-model-len": 2248, + "exp-name": "test_1k1k", + "disagg": True + } + result = validate_matrix_output(entry, is_multinode=True) + assert result == entry + + def test_multinode_missing_spec_decoding(self): + """Test validation fails when spec-decoding is missing.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "isl": 1024, + "osl": 1024, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + }, + "conc": [1, 2, 4, 8], + "max-model-len": 2248, + "exp-name": "test_1k1k", + "disagg": True + } + with pytest.raises(ValueError, match="failed validation"): + validate_matrix_output(entry, is_multinode=True) + + def test_multinode_conc_not_list(self): + """Test validation fails when conc is not a list in multinode entry.""" + entry = { + "image": "test:latest", + "model": "test/model", + "precision": "fp4", + "framework": "dynamo-trt", + "spec-decoding": "mtp", + "runner": "gb200", + "isl": 1024, + "osl": 1024, + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + }, + "conc": 4, # Should be a list + "max-model-len": 2248, + "exp-name": "test_1k1k", + "disagg": True + } + with pytest.raises(ValueError, match="failed validation"): + validate_matrix_output(entry, is_multinode=True) + + +# ============================================================================ +# Tests for validate_runner_config +# ============================================================================ + +class TestValidateRunnerConfig: + """Tests for validate_runner_config function.""" + + def test_valid_runner_config(self): + """Test validation of a valid runner config.""" + config = { + "h200": ["h200-nv_1", "h200-nv_2"], + "b200": ["b200-nv_1"], + "gb200": 
["gb200-nv_1", "gb200-nv_2", "gb200-nv_3"] + } + result = validate_runner_config(config) + assert result == config + + def test_runner_config_value_not_list(self): + """Test validation fails when runner value is not a list.""" + config = { + "h200": "h200-nv_1" # Should be a list + } + with pytest.raises(ValueError, match="must be a list"): + validate_runner_config(config) + + def test_runner_config_list_not_strings(self): + """Test validation fails when list contains non-strings.""" + config = { + "h200": ["h200-nv_1", 123] # Contains non-string + } + with pytest.raises(ValueError, match="must contain only strings"): + validate_runner_config(config) + + def test_runner_config_empty_list(self): + """Test validation fails when runner list is empty.""" + config = { + "h200": [] # Empty list + } + with pytest.raises(ValueError, match="cannot be an empty list"): + validate_runner_config(config) + + +# ============================================================================ +# Tests for Pydantic Models - Unit Tests +# ============================================================================ + +class TestWorkerConfigModel: + """Tests for WorkerConfig Pydantic model.""" + + def test_valid_worker_config(self): + """Test valid WorkerConfig.""" + config = WorkerConfig(**{ + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False, + "additional-settings": ["SETTING1=value1"] + }) + assert config.num_worker == 4 + assert config.tp == 8 + + def test_worker_config_without_additional_settings(self): + """Test WorkerConfig without additional-settings.""" + config = WorkerConfig(**{ + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + }) + assert config.additional_settings is None + + +class TestSingleNodeSearchSpaceEntry: + """Tests for SingleNodeSearchSpaceEntry Pydantic model.""" + + def test_valid_with_range(self): + """Test valid entry with conc-start/conc-end.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-start": 4, + "conc-end": 64 + }) + 
assert entry.tp == 4 + assert entry.conc_start == 4 + assert entry.conc_end == 64 + + def test_valid_with_list(self): + """Test valid entry with conc-list.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [1, 2, 4, 8] + }) + assert entry.tp == 4 + assert entry.conc_list == [1, 2, 4, 8] + + def test_valid_with_optional_fields(self): + """Test valid entry with optional fields.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "ep": 2, + "dp-attn": True, + "spec-decoding": "mtp", + "conc-start": 4, + "conc-end": 64 + }) + assert entry.ep == 2 + assert entry.dp_attn == True + assert entry.spec_decoding == "mtp" + + +class TestMultiNodeSearchSpaceEntry: + """Tests for MultiNodeSearchSpaceEntry Pydantic model.""" + + def test_valid_multinode_entry(self): + """Test valid multinode search-space entry.""" + entry = MultiNodeSearchSpaceEntry(**{ + "conc-list": [1, 2, 4], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False + }, + "decode": { + "num-worker": 4, + "tp": 8, + "ep": 8, + "dp-attn": False + } + }) + assert entry.prefill.num_worker == 1 + assert entry.decode.num_worker == 4 + + +# ============================================================================ +# Tests for Fields Enum +# ============================================================================ + +class TestFieldsEnum: + """Tests for the Fields enum.""" + + def test_field_values(self): + """Test that Fields enum has expected values.""" + assert Fields.IMAGE.value == 'image' + assert Fields.MODEL.value == 'model' + assert Fields.MODEL_PREFIX.value == 'model-prefix' + assert Fields.PRECISION.value == 'precision' + assert Fields.FRAMEWORK.value == 'framework' + assert Fields.RUNNER.value == 'runner' + assert Fields.SEQ_LEN_CONFIGS.value == 'seq-len-configs' + assert Fields.MULTINODE.value == 'multinode' + assert Fields.ISL.value == 'isl' + assert Fields.OSL.value == 'osl' + assert Fields.SEARCH_SPACE.value == 'search-space' + assert Fields.TP.value == 
'tp' + assert Fields.EP.value == 'ep' + assert Fields.CONC_START.value == 'conc-start' + assert Fields.CONC_END.value == 'conc-end' + assert Fields.CONC_LIST.value == 'conc-list' + assert Fields.DP_ATTN.value == 'dp-attn' + assert Fields.DISAGG.value == 'disagg' + + +if __name__ == "__main__": + pytest.main([__file__, "-v"]) diff --git a/utils/matrix-logic/validation.py b/utils/matrix-logic/validation.py index 16696c888..25f83cc38 100644 --- a/utils/matrix-logic/validation.py +++ b/utils/matrix-logic/validation.py @@ -55,8 +55,7 @@ class SingleNodeMatrixEntry(BaseModel): model: str precision: str framework: str - spec_decoding: Optional[Literal["mtp", "draft_model", "none"]] = Field( - default=None, + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( alias=Fields.SPEC_DECODING.value ) runner: str @@ -171,8 +170,8 @@ class SingleNodeSearchSpaceEntry(BaseModel): tp: int ep: Optional[int] = None - spec_decoding: Optional[Literal["mtp", "draft_model", "none"] - ] = Field(default=None, alias=Fields.SPEC_DECODING.value) + spec_decoding: Literal["mtp", "draft_model", "none"] = Field( + default="none", alias=Fields.SPEC_DECODING.value) dp_attn: Optional[bool] = Field( default=None, alias=Fields.DP_ATTN.value) conc_start: Optional[int] = Field( @@ -192,8 +191,7 @@ class MultiNodeSearchSpaceEntry(BaseModel): model_config = ConfigDict(extra='forbid', populate_by_name=True) spec_decoding: Literal["mtp", "draft_model", "none"] = Field( - default="none", - alias=Fields.SPEC_DECODING.value) + default="none", alias=Fields.SPEC_DECODING.value) prefill: WorkerConfig decode: WorkerConfig conc_start: Optional[int] = Field( @@ -277,19 +275,20 @@ def validate_master_config(master_configs: dict) -> List[dict]: # Runner Config Validation + def validate_runner_config(runner_configs: dict) -> List[dict]: """Validate input master configuration structure.""" for key, value in runner_configs.items(): if not isinstance(value, list): raise ValueError( f"Runner config entry '{key}' 
must be a list, got {type(value).__name__}") - + if not all(isinstance(item, str) for item in value): raise ValueError( f"Runner config entry '{key}' must contain only strings") - + if not value: raise ValueError( f"Runner config entry '{key}' cannot be an empty list") - + return runner_configs From 0e44f467798b7e7d7e5d364fe6f68bd888dc5888 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sun, 30 Nov 2025 21:49:44 -0600 Subject: [PATCH 64/98] adding testing workflows --- .github/workflows/full-sweep-1k1k-scheduler.yml | 4 ++++ .github/workflows/full-sweep-1k8k-scheduler.yml | 4 ++++ .github/workflows/full-sweep-8k1k-scheduler.yml | 4 ++++ 3 files changed, 12 insertions(+) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index db8da19fd..38132ceab 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -98,6 +98,8 @@ jobs: ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} benchmark-gptoss-multi-node: needs: get-gptoss-configs @@ -159,6 +161,8 @@ jobs: ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-dsr1-results: needs: diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 29963c44a..8f0571eea 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -98,6 +98,8 @@ jobs: ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} benchmark-gptoss-multi-node: needs: get-gptoss-configs @@ -159,6 +161,8 @@ jobs: ep: ${{ 
matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-dsr1-results: needs: diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 32b2e47dc..3a59c9987 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -98,6 +98,8 @@ jobs: ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} benchmark-gptoss-multi-node: needs: get-gptoss-configs @@ -159,6 +161,8 @@ jobs: ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-dsr1-results: needs: From 8beb869fa4f6f60d75f8f79a53bd720bffe40bd9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 09:38:28 -0600 Subject: [PATCH 65/98] adding testing workflows --- .github/workflows/e2e-tests.yml | 2 + .github/workflows/full-sweep-test.yml | 445 ---- .github/workflows/test-matrix-logic.yml | 1 + utils/matrix-logic/generate_sweep_configs.py | 10 +- .../test_generate_sweep_configs.py | 1303 ++++++------ utils/matrix-logic/test_validation.py | 1854 ++++++----------- utils/matrix-logic/validation.py | 27 +- utils/scrape_image_tag.py | 43 - utils/summarize.py | 9 +- utils/test_process_result.py | 150 -- 10 files changed, 1312 insertions(+), 2532 deletions(-) delete mode 100644 .github/workflows/full-sweep-test.yml delete mode 100644 utils/scrape_image_tag.py delete mode 100644 utils/test_process_result.py diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 127db58ec..4158c7a38 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -90,6 
+90,8 @@ jobs: ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-results: needs: [test-sweep-multi-node, test-sweep-single-node] diff --git a/.github/workflows/full-sweep-test.yml b/.github/workflows/full-sweep-test.yml deleted file mode 100644 index f9664be19..000000000 --- a/.github/workflows/full-sweep-test.yml +++ /dev/null @@ -1,445 +0,0 @@ -name: Test - Full Sweep - -on: - workflow_dispatch: - inputs: - run_1k1k: - type: boolean - required: false - run_8k1k: - type: boolean - required: false - run_1k8k: - type: boolean - required: false - use_h100: - type: boolean - required: false - use_h200: - type: boolean - required: false - use_b200: - type: boolean - required: false - use_mi300x: - type: boolean - required: false - use_mi325x: - type: boolean - required: false - use_mi355x: - type: boolean - required: false - use_gb200: - type: boolean - required: false - -jobs: - get-configs: - runs-on: ubuntu-latest - outputs: - dsr1-1k1k: ${{ steps.generate-configs.outputs.dsr1-1k1k }} - dsr1-1k8k: ${{ steps.generate-configs.outputs.dsr1-1k8k }} - dsr1-8k1k: ${{ steps.generate-configs.outputs.dsr1-8k1k }} - gptoss-1k1k: ${{ steps.generate-configs.outputs.gptoss-1k1k }} - gptoss-1k8k: ${{ steps.generate-configs.outputs.gptoss-1k8k }} - gptoss-8k1k: ${{ steps.generate-configs.outputs.gptoss-8k1k }} - steps: - - name: Checkout code - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - - # This looks complicated, but it is just calling generate_sweep_configs.py conditioned on - # discrete inputs (i.e., run_1k1k, run_h100, etc.) 
to split the test sweep into discrete jobs - - id: generate-configs - run: | - pip install pydantic - - set -x - # Build runner type filters based on inputs - RUNNER_TYPES="${{ inputs.use_h100 && 'h100' || '' }} ${{ inputs.use_h200 && 'h200' || '' }} ${{ inputs.use_b200 && 'b200 b200-trt' || '' }} ${{ inputs.use_mi300x && 'mi300x' || '' }} ${{ inputs.use_mi325x && 'mi325x' || '' }} ${{ inputs.use_mi355x && 'mi355x' || '' }}" - - # DSR1 doesn't support H100, so exclude it - DSR1_RUNNER_TYPES=$(echo $RUNNER_TYPES | sed 's/\bh100\b//g' | xargs) - - # Generate dsr1 configs (only if we have valid runner types for DSR1) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k1k=$DSR1_1K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-1k8k=$DSR1_1K8K" >> $GITHUB_OUTPUT - else - echo "dsr1-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$DSR1_RUNNER_TYPES" ]; then - DSR1_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml 
--seq-lens 8k1k --model-prefix dsr1 --runner-type $DSR1_RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "dsr1-8k1k=$DSR1_8K1K" >> $GITHUB_OUTPUT - else - echo "dsr1-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # Generate gptoss configs (only if we have runner types selected) - if [ "${{ inputs.run_1k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k1k=$GPTOSS_1K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k1k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_1k8k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_1K8K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 1k8k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-1k8k=$GPTOSS_1K8K" >> $GITHUB_OUTPUT - else - echo "gptoss-1k8k=[]" >> $GITHUB_OUTPUT - fi - - if [ "${{ inputs.run_8k1k }}" = "true" ] && [ -n "$RUNNER_TYPES" ]; then - GPTOSS_8K1K=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --seq-lens 8k1k --model-prefix gptoss --runner-type $RUNNER_TYPES --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - echo "gptoss-8k1k=$GPTOSS_8K1K" >> $GITHUB_OUTPUT - else - echo "gptoss-8k1k=[]" >> $GITHUB_OUTPUT - fi - - # DSR1 1K1K Benchmarks - benchmark-dsr1-1k1k: - needs: get-configs - if: ${{ 
needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-1k1k-results: - needs: benchmark-dsr1-1k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k1k" - - # GPTOSS 1K1K Benchmarks - benchmark-gptoss-1k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k1k-results: - needs: benchmark-gptoss-1k1k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: 
- exp-name: "gptoss_1k1k" - - # DSR1 8K1K Benchmarks - benchmark-dsr1-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-dsr1-8k1k-results: - needs: benchmark-dsr1-8k1k - if: ${{ always() && needs.get-configs.outputs.dsr1-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_8k1k" - - # GPTOSS 8K1K Benchmarks - benchmark-gptoss-8k1k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 8k1k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-8k1k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-8k1k-results: - needs: benchmark-gptoss-8k1k - if: ${{ always() && 
needs.get-configs.outputs.gptoss-8k1k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_8k1k" - - # DSR1 1K8K Benchmarks - benchmark-dsr1-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: dsr1 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.dsr1-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - # This is a workaround until we can integrate GB200 into master configs. 
- benchmark-gb200-1k1k: - if: ${{ inputs.use_gb200 && inputs.run_1k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k1k sweep - strategy: - fail-fast: false - matrix: - config: &dsr1_static_configs - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "off", - } - - { - "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", - "model": "deepseek-r1-fp4", - "model-prefix": "dsr1", - "precision": "fp4", - "framework": "dynamo-trtllm", - "mtp": "on", - } - - { - "image": "nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1", - "model": "deepseek-ai/DeepSeek-R1-0528", - "model-prefix": "dsr1", - "precision": "fp8", - "framework": "dynamo-sglang", - "mtp": "off", - } - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k1k - isl: 1024 - osl: 1024 - max-model-len: 2048 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-1k8k: - if: ${{ inputs.use_gb200 && inputs.run_1k8k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 1k8k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_1k8k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - benchmark-gb200-8k1k: - if: ${{ inputs.use_gb200 && inputs.run_8k1k }} - uses: ./.github/workflows/benchmark-multinode-tmpl.yml - name: gb200 8k1k sweep - strategy: - fail-fast: false - matrix: - config: *dsr1_static_configs - 
secrets: inherit - with: - runner: gb200 - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - exp-name: ${{ matrix.config.model-prefix }}_8k1k - isl: 1024 - osl: 8192 - max-model-len: 9216 - mtp-mode: ${{ matrix.config.mtp }} - - collect-dsr1-1k8k-results: - needs: - [ - benchmark-dsr1-1k8k, - benchmark-gb200-1k1k, - benchmark-gb200-1k8k, - benchmark-gb200-8k1k, - ] - if: ${{ always() && needs.get-configs.outputs.dsr1-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "dsr1_1k8k" - - # GPTOSS 1K8K Benchmarks - benchmark-gptoss-1k8k: - needs: get-configs - if: ${{ needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/benchmark-tmpl.yml - name: gptoss 1k8k / - strategy: - fail-fast: false - matrix: - config: ${{ fromJson(needs.get-configs.outputs.gptoss-1k8k) }} - secrets: inherit - with: - exp-name: ${{ matrix.config.exp-name }} - isl: ${{ matrix.config.isl }} - osl: ${{ matrix.config.osl }} - max-model-len: ${{ matrix.config.max-model-len }} - runner: ${{ matrix.config.runner }} - image: ${{ matrix.config.image }} - model: ${{ matrix.config.model }} - framework: ${{ matrix.config.framework }} - precision: ${{ matrix.config.precision }} - tp: ${{ matrix.config.tp }} - ep: ${{ matrix.config.ep }} - dp-attn: ${{ matrix.config.dp-attn }} - conc: ${{ matrix.config.conc }} - - collect-gptoss-1k8k-results: - needs: benchmark-gptoss-1k8k - if: ${{ always() && needs.get-configs.outputs.gptoss-1k8k != '[]' }} - uses: ./.github/workflows/collect-results.yml - secrets: inherit - with: - exp-name: "gptoss_1k8k" - - calc-success-rate: - needs: - [ - collect-dsr1-1k1k-results, - collect-dsr1-1k8k-results, - collect-dsr1-8k1k-results, - collect-gptoss-1k1k-results, - collect-gptoss-1k8k-results, - collect-gptoss-8k1k-results, - ] - if: ${{ always() }} - runs-on: ubuntu-latest - - env: - RESULTS_DIR: 
"results/" - STATS_FILENAME: "run_stats" - GITHUB_TOKEN: ${{ secrets.REPO_PAT }} - - steps: - - uses: actions/checkout@1af3b93b6815bc44a9784bd300feb67ff0d1eeb3 # v6.0.0 - with: - token: ${{ secrets.REPO_PAT }} - fetch-depth: 0 - - - name: Download results artifacts - uses: actions/download-artifact@018cc2cf5baa6db3ef3c5f8a56943fffe632ef53 # v6.0.0 - with: - path: ${{ env.RESULTS_DIR }} - pattern: results_* - - - name: Install python dependencies - run: pip install PyGithub - - - name: Calculate success rate - run: python3 utils/calc_success_rate.py $STATS_FILENAME - - - uses: actions/upload-artifact@330a01c490aca151604b8cf639adc76d48f6c5d4 # v5.0.0 - with: - name: "run-stats" - path: ${{ env.STATS_FILENAME }}.json diff --git a/.github/workflows/test-matrix-logic.yml b/.github/workflows/test-matrix-logic.yml index 34d650b38..d31efdfe1 100644 --- a/.github/workflows/test-matrix-logic.yml +++ b/.github/workflows/test-matrix-logic.yml @@ -33,3 +33,4 @@ jobs: run: | cd utils/matrix-logic pytest test_generate_sweep_configs.py -v + pytest test_validation.py -v diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index be87e5e25..ca9ce6291 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -2,7 +2,7 @@ import yaml import argparse -from validation import validate_master_config, validate_matrix_output, validate_runner_config, Fields +from validation import validate_master_config, validate_matrix_entry, validate_runner_config, Fields seq_len_stoi = { "1k1k": (1024, 1024), @@ -31,7 +31,7 @@ def generate_full_sweep(args, all_config_data, runner_data): All filters are optional - can generate sweeps for all configs or filter by specific criteria. - Assumes all_config_data has been validated by validate_config_structure(). + Assumes all_config_data has been validated by validate_master_config(). 
""" # Validate runner types if specified if args.runner_type: @@ -49,6 +49,8 @@ def generate_full_sweep(args, all_config_data, runner_data): if args.seq_lens: seq_lens_filter = {seq_len_stoi[sl] for sl in args.seq_lens} + # Iterate through all configurations and apply filters as specified (this is just "selecting" + # configs from all of the master configs subject to some pattern matching) for key, val in all_config_data.items(): # Filter by model prefix if specified if args.model_prefix: @@ -147,7 +149,7 @@ def generate_full_sweep(args, all_config_data, runner_data): Fields.DISAGG.value: disagg, } - validate_matrix_output(entry, is_multinode) + validate_matrix_entry(entry, is_multinode) matrix_values.append(entry) elif args.single_node: # Single-node configuration @@ -198,7 +200,7 @@ def generate_full_sweep(args, all_config_data, runner_data): if dp_attn is not None: entry[Fields.DP_ATTN.value] = dp_attn - validate_matrix_output(entry, is_multinode) + validate_matrix_entry(entry, is_multinode) matrix_values.append(entry) if conc == conc_end: diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index 917457852..ba45f2ab5 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -1,74 +1,48 @@ +"""Comprehensive tests for generate_sweep_configs.py""" import pytest -import yaml -from unittest.mock import patch +import json +import argparse +from unittest.mock import patch, mock_open from generate_sweep_configs import ( + seq_len_stoi, + seq_len_itos, seq_len_to_str, generate_full_sweep, generate_runner_model_sweep_config, load_config_files, load_runner_file, - main, ) -# ============================================================================ -# Fixtures -# ============================================================================ +# ============================================================================= +# Test Fixtures +# 
============================================================================= @pytest.fixture def sample_single_node_config(): - """Sample master config with single-node entries.""" + """Single node config based on dsr1-fp8-mi300x-sglang.""" return { - "dsr1-fp8-h200-sglang": { - "image": "lmsysorg/sglang:v0.5.5-cu129-amd64", + "dsr1-fp8-mi300x-sglang": { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", "model": "deepseek-ai/DeepSeek-R1-0528", "model-prefix": "dsr1", - "runner": "h200", "precision": "fp8", "framework": "sglang", + "runner": "mi300x", "multinode": False, "seq-len-configs": [ { "isl": 1024, "osl": 1024, "search-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64, "ep": 2, "dp-attn": True} - ] - }, - { - "isl": 1024, - "osl": 8192, - "search-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 32} + {"tp": 8, "conc-start": 4, "conc-end": 64} ] }, { "isl": 8192, "osl": 1024, "search-space": [ - {"tp": 8, "conc-start": 4, "conc-end": 16} - ] - } - ] - }, - "gptoss-fp4-b200-vllm": { - "image": "vllm/vllm-openai:v0.11.0", - "model": "openai/gpt-oss-120b", - "model-prefix": "gptoss", - "runner": "b200", - "precision": "fp4", - "framework": "vllm", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 1, "conc-start": 4, "conc-end": 128}, - {"tp": 2, "conc-start": 4, "conc-end": 128}, - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 8} + {"tp": 8, "conc-start": 4, "conc-end": 64} ] } ] @@ -78,15 +52,15 @@ def sample_single_node_config(): @pytest.fixture def sample_multinode_config(): - """Sample master config with multinode entries.""" + """Multinode config based on dsr1-fp4-gb200-dynamo-trt.""" return { "dsr1-fp4-gb200-dynamo-trt": { "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", "model-prefix": "dsr1", - "runner": "gb200", "precision": 
"fp4", "framework": "dynamo-trt", + "runner": "gb200", "multinode": True, "disagg": True, "seq-len-configs": [ @@ -95,39 +69,27 @@ def sample_multinode_config(): "osl": 1024, "search-space": [ { - "spec-decoding": "mtp", - "conc-list": [1, 2, 4, 8, 16, 36], + "conc-list": [2150], "prefill": { - "num-worker": 1, + "num-worker": 5, "tp": 4, "ep": 4, - "dp-attn": False, - "additional-settings": ["PREFILL_MAX_NUM_TOKENS=4608"] + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], }, "decode": { - "num-worker": 4, + "num-worker": 1, "tp": 8, "ep": 8, - "dp-attn": False, - "additional-settings": ["DECODE_MAX_NUM_TOKENS=128"] - } - }, - { - "conc-list": [64, 128], - "prefill": { - "num-worker": 1, - "tp": 4, - "ep": 4, "dp-attn": True, - "additional-settings": [] + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + ], }, - "decode": { - "num-worker": 1, - "tp": 16, - "ep": 16, - "dp-attn": True, - "additional-settings": [] - } } ] } @@ -138,723 +100,694 @@ def sample_multinode_config(): @pytest.fixture def sample_runner_config(): - """Sample runner config.""" + """Runner config based on .github/configs/runners.yaml.""" return { - "h200": ["h200-nv_1", "h200-nv_2"], - "b200": ["b200-nv_1"], - "gb200": ["gb200-nv_1", "gb200-nv_2", "gb200-nv_3"], - "h100": ["h100-aws_1"] + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], } @pytest.fixture -def temp_config_files(tmp_path, sample_single_node_config, sample_runner_config): - """Create temporary config files for single-node tests.""" - master_file = tmp_path / "master.yaml" - runner_file = tmp_path / "runners.yaml" - - with open(master_file, 'w') as f: - yaml.dump(sample_single_node_config, f) - - with 
open(runner_file, 'w') as f: - yaml.dump(sample_runner_config, f) - - return str(master_file), str(runner_file) +def full_sweep_args_single_node(): + """Args for full-sweep single-node command.""" + args = argparse.Namespace() + args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = True + args.multi_node = False + return args @pytest.fixture -def temp_multinode_config_files(tmp_path, sample_multinode_config, sample_runner_config): - """Create temporary config files for multinode tests.""" - master_file = tmp_path / "master.yaml" - runner_file = tmp_path / "runners.yaml" - - with open(master_file, 'w') as f: - yaml.dump(sample_multinode_config, f) - - with open(runner_file, 'w') as f: - yaml.dump(sample_runner_config, f) - - return str(master_file), str(runner_file) - - -# ============================================================================ -# Helper class for mocking args -# ============================================================================ - -class MockArgs: - """Mock args object for testing functions.""" - def __init__(self, **kwargs): - # Defaults - self.model_prefix = None - self.precision = None - self.framework = None - self.runner_type = None - self.seq_lens = None - self.step_size = 2 - self.max_conc = None - self.max_tp = None - self.max_ep = None - self.single_node = False - self.multi_node = False - self.runner_config = None - self.runner_node_filter = None - - # Override with provided kwargs - for key, value in kwargs.items(): - setattr(self, key, value) - - -# ============================================================================ -# Tests for seq_len_to_str -# ============================================================================ +def full_sweep_args_multi_node(): + """Args for full-sweep multi-node command.""" + args = argparse.Namespace() + 
args.model_prefix = None + args.precision = None + args.framework = None + args.runner_type = None + args.seq_lens = None + args.step_size = 2 + args.max_conc = None + args.max_tp = None + args.max_ep = None + args.single_node = False + args.multi_node = True + return args + + +# ============================================================================= +# Test seq_len mappings +# ============================================================================= + +class TestSeqLenMappings: + """Tests for sequence length string mappings.""" + + def test_seq_len_stoi_values(self): + """Verify seq_len_stoi has expected mappings.""" + assert seq_len_stoi["1k1k"] == (1024, 1024) + assert seq_len_stoi["1k8k"] == (1024, 8192) + assert seq_len_stoi["8k1k"] == (8192, 1024) + + def test_seq_len_itos_reverse_mapping(self): + """Verify seq_len_itos is reverse of stoi.""" + assert seq_len_itos[(1024, 1024)] == "1k1k" + assert seq_len_itos[(1024, 8192)] == "1k8k" + assert seq_len_itos[(8192, 1024)] == "8k1k" + class TestSeqLenToStr: - """Tests for the seq_len_to_str function.""" + """Tests for seq_len_to_str function.""" - def test_known_mapping_1k1k(self): + def test_known_sequence_lengths(self): + """Known sequence lengths should return short name.""" assert seq_len_to_str(1024, 1024) == "1k1k" - - def test_known_mapping_1k8k(self): assert seq_len_to_str(1024, 8192) == "1k8k" - - def test_known_mapping_8k1k(self): assert seq_len_to_str(8192, 1024) == "8k1k" - def test_unknown_mapping_fallback(self): - assert seq_len_to_str(2048, 4096) == "2048_4096" - - def test_unknown_mapping_small_values(self): - assert seq_len_to_str(512, 512) == "512_512" - - -# ============================================================================ -# Tests for load_config_files -# ============================================================================ - -class TestLoadConfigFiles: - """Tests for the load_config_files function.""" - - def test_load_single_valid_file(self, temp_config_files): - 
master_file, _ = temp_config_files - result = load_config_files([master_file]) - assert len(result) == 2 - assert "dsr1-fp8-h200-sglang" in result - assert "gptoss-fp4-b200-vllm" in result + def test_unknown_sequence_lengths(self): + """Unknown sequence lengths should return isl_osl format.""" + assert seq_len_to_str(2048, 2048) == "2048_2048" + assert seq_len_to_str(4096, 1024) == "4096_1024" - def test_load_multiple_files(self, tmp_path, sample_single_node_config): - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - config1 = {"dsr1-fp8-h200-sglang": sample_single_node_config["dsr1-fp8-h200-sglang"]} - config2 = {"gptoss-fp4-b200-vllm": sample_single_node_config["gptoss-fp4-b200-vllm"]} - - with open(file1, 'w') as f: - yaml.dump(config1, f) - with open(file2, 'w') as f: - yaml.dump(config2, f) - - result = load_config_files([str(file1), str(file2)]) - assert len(result) == 2 - - def test_load_nonexistent_file(self): - with pytest.raises(ValueError, match="does not exist"): - load_config_files(["/nonexistent/file.yaml"]) - - def test_load_files_with_duplicate_keys(self, tmp_path, sample_single_node_config): - file1 = tmp_path / "config1.yaml" - file2 = tmp_path / "config2.yaml" - - config = {"dsr1-fp8-h200-sglang": sample_single_node_config["dsr1-fp8-h200-sglang"]} - - with open(file1, 'w') as f: - yaml.dump(config, f) - with open(file2, 'w') as f: - yaml.dump(config, f) - - with pytest.raises(ValueError, match="Duplicate configuration keys"): - load_config_files([str(file1), str(file2)]) - - -# ============================================================================ -# Tests for load_runner_file -# ============================================================================ - -class TestLoadRunnerFile: - """Tests for the load_runner_file function.""" - - def test_load_valid_runner_file(self, temp_config_files): - _, runner_file = temp_config_files - result = load_runner_file(runner_file) - assert "h200" in result - assert "b200" in 
result - - def test_load_nonexistent_runner_file(self): - with pytest.raises(ValueError, match="does not exist"): - load_runner_file("/nonexistent/runners.yaml") - - -# ============================================================================ -# Tests for generate_full_sweep - Single Node -# ============================================================================ +# ============================================================================= +# Test generate_full_sweep for single-node +# ============================================================================= class TestGenerateFullSweepSingleNode: - """Tests for generate_full_sweep with single-node configurations.""" - - def test_basic_sweep(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - single_node=True + """Tests for generate_full_sweep with single-node configs.""" + + def test_basic_sweep_generation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Basic single-node sweep should generate entries.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) assert len(result) > 0 - assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) - - def test_sweep_no_filters(self, sample_single_node_config, sample_runner_config): - args = MockArgs(single_node=True) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) + # With step_size=2, conc goes 4, 8, 16, 32, 64 = 5 values per seq-len config + # 2 seq-len configs * 5 = 10 entries + assert len(result) == 10 + + def test_matrix_entry_structure(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Generated entries should have correct structure.""" + result = generate_full_sweep( + full_sweep_args_single_node, + 
sample_single_node_config, + sample_runner_config + ) + entry = result[0] + assert entry["image"] == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915" + assert entry["model"] == "deepseek-ai/DeepSeek-R1-0528" + assert entry["precision"] == "fp8" + assert entry["framework"] == "sglang" + assert entry["runner"] == "mi300x" + assert entry["tp"] == 8 + assert "exp-name" in entry + assert "max-model-len" in entry + + def test_filter_by_model_prefix(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by model prefix should work.""" + full_sweep_args_single_node.model_prefix = ["dsr1"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) assert len(result) > 0 - def test_sweep_returns_empty_when_no_matches(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["nonexistent"], - single_node=True + # Non-matching prefix should return empty + full_sweep_args_single_node.model_prefix = ["nonexistent"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert result == [] - - def test_filter_by_precision(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - precision=["fp8"], - single_node=True + assert len(result) == 0 + + def test_filter_by_precision(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by precision should work.""" + full_sweep_args_single_node.precision = ["fp8"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(entry['precision'] == 'fp8' for entry in result) + assert len(result) > 0 - def 
test_filter_by_framework(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - framework=["vllm"], - single_node=True + full_sweep_args_single_node.precision = ["fp4"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(entry['framework'] == 'vllm' for entry in result) - - def test_filter_by_runner_type(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - runner_type=["h200"], - single_node=True + assert len(result) == 0 + + def test_filter_by_framework(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by framework should work.""" + full_sweep_args_single_node.framework = ["sglang"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(entry['runner'] == 'h200' for entry in result) + assert len(result) > 0 - def test_invalid_runner_type_raises_error(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - runner_type=["invalid-runner"], - single_node=True + full_sweep_args_single_node.framework = ["vllm"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - with pytest.raises(ValueError, match="Invalid runner type"): - generate_full_sweep(args, sample_single_node_config, sample_runner_config) + assert len(result) == 0 + + def test_filter_by_runner_type(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by runner type should work.""" + full_sweep_args_single_node.runner_type = ["mi300x"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) 
> 0 - def test_multiple_runner_types(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - runner_type=["h200", "b200"], - single_node=True + full_sweep_args_single_node.runner_type = ["h100"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0 + + def test_invalid_runner_type_raises_error(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Invalid runner type should raise ValueError.""" + full_sweep_args_single_node.runner_type = ["invalid_runner"] + with pytest.raises(ValueError) as exc_info: + generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert "Invalid runner type" in str(exc_info.value) + + def test_filter_by_seq_lens(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Filter by sequence lengths should work.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - runners = set(entry['runner'] for entry in result) - assert 'h200' in runners or 'b200' in runners - - def test_filter_by_seq_lens(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k8k"], - single_node=True + # Only 1k1k entries, 5 concurrency values + assert len(result) == 5 + assert all(entry["isl"] == 1024 and entry["osl"] == 1024 for entry in result) + + def test_max_conc_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc filter should limit concurrency values.""" + full_sweep_args_single_node.max_conc = 16 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + 
sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(entry['isl'] == 1024 and entry['osl'] == 8192 for entry in result) - - def test_filter_multiple_seq_lens(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k", "8k1k"], - single_node=True + # conc values: 4, 8, 16 (32, 64 filtered out) + assert len(result) == 3 + assert all(entry["conc"] <= 16 for entry in result) + + def test_max_tp_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp filter should limit TP values.""" + full_sweep_args_single_node.max_tp = 4 + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - seq_lens = set((e['isl'], e['osl']) for e in result) - assert (1024, 1024) in seq_lens - assert (8192, 1024) in seq_lens - - def test_step_size(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["gptoss"], - seq_lens=["1k1k"], - step_size=4, - single_node=True + # tp=8 is filtered out, so no results + assert len(result) == 0 + + def test_step_size(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """Different step sizes should affect concurrency progression.""" + full_sweep_args_single_node.step_size = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - # With step_size=4, starting from 4: 4, 16, 64, 128 (or clamped) - conc_values = set(e['conc'] for e in result) + # conc: 4, 16, 64 = 3 values + assert len(result) == 3 + conc_values = [entry["conc"] for entry in result] 
assert 4 in conc_values + assert 16 in conc_values + assert 64 in conc_values + + def test_exp_name_format(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """exp-name should have correct format.""" + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert all(entry["exp-name"] == "dsr1_1k1k" for entry in result) + + def test_max_model_len_calculation(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max-model-len should be isl + osl + 200.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + expected_max_model_len = entry["isl"] + entry["osl"] + 200 + assert entry["max-model-len"] == expected_max_model_len + + +# ============================================================================= +# Test generate_full_sweep for multi-node +# ============================================================================= - def test_max_conc_filter(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - max_conc=16, - single_node=True +class TestGenerateFullSweepMultiNode: + """Tests for generate_full_sweep with multi-node configs.""" + + def test_multinode_sweep_generation(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode sweep should generate entries with prefill/decode.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config + ) + assert len(result) == 1 # One entry with conc-list + + def test_multinode_entry_structure(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode entries should have prefill and decode configs.""" + result = generate_full_sweep( + 
full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(entry['conc'] <= 16 for entry in result) - - def test_max_tp_filter(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - max_tp=4, - single_node=True + entry = result[0] + assert "prefill" in entry + assert "decode" in entry + assert entry["prefill"]["num-worker"] == 5 + assert entry["decode"]["num-worker"] == 1 + assert entry["disagg"] is True + + def test_multinode_conc_as_list(self, sample_multinode_config, sample_runner_config, full_sweep_args_multi_node): + """Multinode conc should be passed as list.""" + result = generate_full_sweep( + full_sweep_args_multi_node, + sample_multinode_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(entry['tp'] <= 4 for entry in result) - - def test_max_ep_filter(self, sample_single_node_config, sample_runner_config): - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - max_ep=1, - single_node=True + entry = result[0] + assert isinstance(entry["conc"], list) + assert entry["conc"] == [2150] + + def test_single_node_flag_skips_multinode(self, sample_multinode_config, sample_runner_config, full_sweep_args_single_node): + """Single-node flag should skip multinode configs.""" + result = generate_full_sweep( + full_sweep_args_single_node, + sample_multinode_config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - # Should exclude entries with ep > 1 - assert all(entry['ep'] <= 1 for entry in result) + assert len(result) == 0 + - def test_concurrency_overshoot_clamped(self, sample_runner_config): - """Test that concurrency values are clamped to conc-end.""" +# 
============================================================================= +# Test generate_runner_model_sweep_config +# ============================================================================= + +class TestGenerateRunnerModelSweepConfig: + """Tests for generate_runner_model_sweep_config function.""" + + @pytest.fixture + def runner_sweep_args(self): + """Args for runner-model-sweep command.""" + args = argparse.Namespace() + args.runner_type = "mi300x" + args.runner_config = "runners.yaml" + args.runner_node_filter = None + return args + + def test_basic_runner_sweep(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Basic runner sweep should generate entries for each node.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # 3 mi300x nodes + assert len(result) == 3 + + def test_runner_sweep_entry_structure(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner sweep entries should use 1k1k config.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + for entry in result: + assert entry["isl"] == 1024 + assert entry["osl"] == 1024 + assert entry["max-model-len"] == 2048 + assert "_test" in entry["exp-name"] + + def test_each_node_gets_entry(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Each runner node should get its own entry.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + runners = [entry["runner"] for entry in result] + assert "mi300x-amd_0" in runners + assert "mi300x-amd_1" in runners + assert "mi300x-cr_0" in runners + + def test_invalid_runner_type(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Invalid runner type should raise error.""" + runner_sweep_args.runner_type = "nonexistent" + with 
pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "does not exist" in str(exc_info.value) + + def test_runner_node_filter(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter should limit nodes.""" + runner_sweep_args.runner_node_filter = "amd" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Only mi300x-amd_0 and mi300x-amd_1 match + assert len(result) == 2 + assert all("amd" in entry["runner"] for entry in result) + + def test_runner_node_filter_no_match(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Runner node filter with no matches should raise error.""" + runner_sweep_args.runner_node_filter = "nonexistent" + with pytest.raises(ValueError) as exc_info: + generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + assert "No runner nodes found" in str(exc_info.value) + + def test_uses_highest_tp(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use highest TP from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has tp=8 + assert all(entry["tp"] == 8 for entry in result) + + def test_uses_lowest_conc(self, sample_single_node_config, sample_runner_config, runner_sweep_args): + """Should use lowest concurrency from search space.""" + result = generate_runner_model_sweep_config( + runner_sweep_args, + sample_single_node_config, + sample_runner_config + ) + # Config has conc-start=4 + assert all(entry["conc"] == 4 for entry in result) + + +# ============================================================================= +# Test load_config_files +# 
============================================================================= + +class TestLoadConfigFiles: + """Tests for load_config_files function.""" + + def test_load_single_file(self, tmp_path): + """Should load a single config file.""" + config_file = tmp_path / "config.yaml" + config_file.write_text(""" +test-config: + image: test-image + model: test-model +""") + result = load_config_files([str(config_file)]) + assert "test-config" in result + assert result["test-config"]["image"] == "test-image" + + def test_load_multiple_files(self, tmp_path): + """Should merge multiple config files.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +config-one: + value: 1 +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +config-two: + value: 2 +""") + result = load_config_files([str(config1), str(config2)]) + assert "config-one" in result + assert "config-two" in result + + def test_duplicate_keys_raise_error(self, tmp_path): + """Duplicate keys across files should raise error.""" + config1 = tmp_path / "config1.yaml" + config1.write_text(""" +duplicate-key: + value: 1 +""") + config2 = tmp_path / "config2.yaml" + config2.write_text(""" +duplicate-key: + value: 2 +""") + with pytest.raises(ValueError) as exc_info: + load_config_files([str(config1), str(config2)]) + assert "Duplicate configuration keys" in str(exc_info.value) + + def test_nonexistent_file_raises_error(self): + """Nonexistent file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_config_files(["nonexistent.yaml"]) + assert "does not exist" in str(exc_info.value) + + +# ============================================================================= +# Test load_runner_file +# ============================================================================= + +class TestLoadRunnerFile: + """Tests for load_runner_file function.""" + + def test_load_runner_file(self, tmp_path): + """Should load runner config file.""" + runner_file = tmp_path / 
"runners.yaml" + runner_file.write_text(""" +h100: +- h100-node-0 +- h100-node-1 +""") + result = load_runner_file(str(runner_file)) + assert "h100" in result + assert len(result["h100"]) == 2 + + def test_nonexistent_runner_file(self): + """Nonexistent runner file should raise error.""" + with pytest.raises(ValueError) as exc_info: + load_runner_file("nonexistent.yaml") + assert "does not exist" in str(exc_info.value) + + +# ============================================================================= +# Test edge cases and special configurations +# ============================================================================= + +class TestEdgeCases: + """Tests for edge cases and special configurations.""" + + def test_config_with_ep_and_dp_attn(self, sample_runner_config, full_sweep_args_single_node): + """Config with ep and dp-attn should be handled correctly.""" config = { "test-config": { - "image": "test:latest", - "model": "test/model", + "image": "test-image", + "model": "test-model", "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", "multinode": False, "seq-len-configs": [ { "isl": 1024, "osl": 1024, "search-space": [ - {"tp": 4, "conc-start": 1, "conc-end": 5} + {"tp": 4, "ep": 4, "dp-attn": True, "conc-start": 4, "conc-end": 4} ] } ] } } - args = MockArgs(step_size=3, single_node=True) - result = generate_full_sweep(args, config, sample_runner_config) - conc_values = sorted(set(e['conc'] for e in result)) - # 1, 3, 9 -> clamped to 5 - assert conc_values == [1, 3, 5] - - def test_default_ep_dp_attn_values(self, sample_single_node_config, sample_runner_config): - """Test that entries without ep/dp-attn get default values.""" - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - max_tp=4, # Filter to tp=4 which doesn't have ep/dp-attn - single_node=True - ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) 
- # tp=4 entries should have default ep=1 and dp-attn=False - for entry in result: - if entry['tp'] == 4: - assert entry['ep'] == 1 - assert entry['dp-attn'] == False - - def test_explicit_ep_dp_attn_values(self, sample_single_node_config, sample_runner_config): - """Test that entries with explicit ep/dp-attn use those values.""" - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - single_node=True - ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - # tp=8 entries should have ep=2 and dp-attn=True - tp8_entries = [e for e in result if e['tp'] == 8] - assert all(e['ep'] == 2 for e in tp8_entries) - assert all(e['dp-attn'] == True for e in tp8_entries) - - def test_max_model_len_calculation(self, sample_single_node_config, sample_runner_config): - """Test that max-model-len is calculated as isl + osl + 200.""" - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k8k"], - single_node=True + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - # isl=1024, osl=8192 -> max-model-len = 1024 + 8192 + 200 = 9416 - assert all(e['max-model-len'] == 9416 for e in result) - - def test_exp_name_format(self, sample_single_node_config, sample_runner_config): - """Test that exp-name follows the expected format.""" - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - single_node=True - ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(e['exp-name'] == 'dsr1_1k1k' for e in result) - - def test_disagg_defaults_to_false(self, sample_single_node_config, sample_runner_config): - """Test that disagg defaults to False when not specified.""" - args = MockArgs(single_node=True) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(e['disagg'] == False for e in result) - - def 
test_skips_multinode_configs_in_single_node_mode(self, sample_multinode_config, sample_runner_config): - """Test that multinode configs are skipped when --single-node is specified.""" - args = MockArgs(single_node=True) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - assert result == [] - - -# ============================================================================ -# Tests for generate_full_sweep - Multi Node -# ============================================================================ - -class TestGenerateFullSweepMultiNode: - """Tests for generate_full_sweep with multinode configurations.""" - - def test_basic_multinode_sweep(self, sample_multinode_config, sample_runner_config): - args = MockArgs(multi_node=True) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - assert len(result) > 0 - - def test_multinode_conc_is_list(self, sample_multinode_config, sample_runner_config): - """Test that multinode entries have conc as a list.""" - args = MockArgs(multi_node=True) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - for entry in result: - assert isinstance(entry['conc'], list) - - def test_multinode_has_prefill_decode(self, sample_multinode_config, sample_runner_config): - """Test that multinode entries have prefill and decode configs.""" - args = MockArgs(multi_node=True) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - for entry in result: - assert 'prefill' in entry - assert 'decode' in entry - - def test_multinode_spec_decoding_defaults_to_none(self, sample_multinode_config, sample_runner_config): - """Test that spec-decoding defaults to 'none' if not specified.""" - args = MockArgs(multi_node=True) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - # The second search-space entry doesn't specify spec-decoding - for entry in result: - assert entry['spec-decoding'] in ['mtp', 
'none'] - - def test_multinode_disagg_value(self, sample_multinode_config, sample_runner_config): - """Test that disagg is properly passed through.""" - args = MockArgs(multi_node=True) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - # The sample config has disagg=True - assert all(e['disagg'] == True for e in result) - - def test_multinode_max_conc_filter(self, sample_multinode_config, sample_runner_config): - """Test max_conc filter works with multinode conc lists.""" - args = MockArgs(multi_node=True, max_conc=8) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - for entry in result: - assert all(c <= 8 for c in entry['conc']) - - def test_multinode_max_conc_filters_out_empty(self, sample_multinode_config, sample_runner_config): - """Test that entries with no valid conc values are filtered out.""" - args = MockArgs(multi_node=True, max_conc=0) - result = generate_full_sweep(args, sample_multinode_config, sample_runner_config) - assert result == [] - - def test_skips_single_node_configs_in_multi_node_mode(self, sample_single_node_config, sample_runner_config): - """Test that single-node configs are skipped when --multi-node is specified.""" - args = MockArgs(multi_node=True) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert result == [] - - -# ============================================================================ -# Tests for generate_runner_model_sweep_config -# ============================================================================ - -class TestGenerateRunnerModelSweepConfig: - """Tests for the generate_runner_model_sweep_config function.""" - - def test_basic_runner_model_sweep(self, sample_single_node_config, sample_runner_config): - args = MockArgs(runner_type="h200") - result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - assert len(result) > 0 - runners = set(entry['runner'] for 
entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - def test_invalid_runner_type(self, sample_single_node_config, sample_runner_config): - args = MockArgs(runner_type="invalid-runner") - with pytest.raises(ValueError, match="does not exist in runner config"): - generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - - def test_runner_node_filter(self, sample_single_node_config, sample_runner_config): - args = MockArgs(runner_type="h200", runner_node_filter="nv_1") - result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' not in runners - - def test_runner_node_filter_multiple_matches(self, sample_single_node_config, sample_runner_config): - args = MockArgs(runner_type="h200", runner_node_filter="nv") - result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' in runners - - def test_runner_node_filter_no_matches(self, sample_single_node_config, sample_runner_config): - args = MockArgs(runner_type="h200", runner_node_filter="nonexistent") - with pytest.raises(ValueError, match="No runner nodes found matching filter"): - generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - - def test_uses_highest_tp_lowest_conc(self, sample_single_node_config, sample_runner_config): - """Test that it uses highest TP with lowest concurrency.""" - args = MockArgs(runner_type="h200") - result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - # dsr1 config has tp=4 (conc 4-64) and tp=8 (conc 4-64), should pick tp=8, conc=4 - for entry in result: - assert entry['tp'] == 8 - assert entry['conc'] == 4 - - def test_always_uses_1k1k(self, 
sample_single_node_config, sample_runner_config): - """Test that it always uses 1k1k sequence lengths.""" - args = MockArgs(runner_type="h200") - result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - assert all(entry['isl'] == 1024 and entry['osl'] == 1024 for entry in result) - - def test_exp_name_has_test_suffix(self, sample_single_node_config, sample_runner_config): - """Test that exp-name has _test suffix.""" - args = MockArgs(runner_type="h200") - result = generate_runner_model_sweep_config(args, sample_single_node_config, sample_runner_config) - assert all('_test' in entry['exp-name'] for entry in result) - - -# ============================================================================ -# Tests for main function -# ============================================================================ - -class TestMain: - """Tests for the main function with CLI argument parsing.""" - - def test_main_full_sweep_single_node(self, temp_config_files): - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--seq-lens", "1k1k", - "--model-prefix", "dsr1", - "--step-size", "2", - "--single-node" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - def test_main_full_sweep_multi_node(self, temp_multinode_config_files): - master_file, runner_file = temp_multinode_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--multi-node" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - def test_main_full_sweep_with_filters(self, temp_config_files): - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--model-prefix", 
"dsr1", - "--precision", "fp8", - "--framework", "sglang", - "--single-node" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - assert all(entry['precision'] == 'fp8' for entry in result) - assert all(entry['framework'] == 'sglang' for entry in result) - - def test_main_full_sweep_empty_result(self, temp_config_files): - """Test that empty results are returned without error.""" - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "full-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--model-prefix", "nonexistent", - "--single-node" - ] - - with patch('sys.argv', test_args): - result = main() - assert result == [] - - def test_main_runner_model_sweep(self, temp_config_files): - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - - def test_main_runner_model_sweep_with_filter(self, temp_config_files): - master_file, runner_file = temp_config_files - - test_args = [ - "generate_sweep_configs.py", - "runner-model-sweep", - "--config-files", master_file, - "--runner-config", runner_file, - "--runner-type", "h200", - "--runner-node-filter", "nv_1" - ] - - with patch('sys.argv', test_args): - result = main() - assert len(result) > 0 - runners = set(entry['runner'] for entry in result) - assert 'h200-nv_1' in runners - assert 'h200-nv_2' not in runners - - -# ============================================================================ -# Edge case tests -# ============================================================================ + assert len(result) == 1 + assert result[0]["ep"] == 4 + assert result[0]["dp-attn"] is True -class TestEdgeCases: - """Edge case tests.""" + def test_config_with_spec_decoding(self, 
sample_runner_config, full_sweep_args_single_node): + """Config with spec-decoding should be handled correctly.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "trt", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "spec-decoding": "mtp", "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 1 + assert result[0]["spec-decoding"] == "mtp" - def test_concurrency_range_equals_start_end(self, sample_runner_config): - """Test when conc-start equals conc-end.""" + def test_conc_list_in_single_node(self, sample_runner_config, full_sweep_args_single_node): + """Single node config with conc-list should work.""" config = { "test-config": { - "image": "test:latest", - "model": "test/model", + "image": "test-image", + "model": "test-model", "model-prefix": "test", "precision": "fp8", - "framework": "vllm", - "runner": "h200", + "framework": "sglang", + "runner": "mi300x", "multinode": False, "seq-len-configs": [ { "isl": 1024, "osl": 1024, "search-space": [ - {"tp": 4, "conc-start": 8, "conc-end": 8} + {"tp": 8, "conc-start": 4, "conc-end": 16} ] } ] } } - args = MockArgs(single_node=True) - result = generate_full_sweep(args, config, sample_runner_config) - assert len(result) == 1 - assert result[0]['conc'] == 8 - - def test_multiple_model_prefixes(self, sample_single_node_config, sample_runner_config): - """Test filtering with multiple model prefixes.""" - args = MockArgs( - model_prefix=["dsr1", "gptoss"], - seq_lens=["1k1k"], - single_node=True - ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - exp_names = [e['exp-name'] for e in result] - assert any('dsr1' in name for name in exp_names) - assert any('gptoss' in name for name in 
exp_names) - - def test_combined_max_filters(self, sample_single_node_config, sample_runner_config): - """Test combining max_tp, max_ep, and max_conc filters.""" - args = MockArgs( - model_prefix=["dsr1"], - seq_lens=["1k1k"], - max_tp=4, - max_conc=8, - single_node=True + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config ) - result = generate_full_sweep(args, sample_single_node_config, sample_runner_config) - assert all(entry['tp'] <= 4 for entry in result) - assert all(entry['conc'] <= 8 for entry in result) + conc_values = [entry["conc"] for entry in result] + assert 4 in conc_values + assert 8 in conc_values + assert 16 in conc_values - def test_multinode_conc_range_instead_of_list(self, sample_runner_config): - """Test multinode config with conc-start/conc-end instead of conc-list.""" + def test_disagg_defaults_to_false(self, sample_runner_config, full_sweep_args_single_node): + """disagg should default to False when not specified.""" config = { - "multinode-config": { - "image": "test:latest", - "model": "test/model", + "test-config": { + "image": "test-image", + "model": "test-model", "model-prefix": "test", - "runner": "gb200", "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + # No disagg field + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert result[0]["disagg"] is False + + def test_multinode_conc_range_expansion(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode with conc range should expand to list.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", "framework": "dynamo-trt", + "runner": "gb200", "multinode": True, - "disagg": True, "seq-len-configs": [ { "isl": 1024, 
"osl": 1024, "search-space": [ { - "conc-start": 4, - "conc-end": 16, + "conc-start": 1, + "conc-end": 8, "prefill": { "num-worker": 1, "tp": 4, "ep": 4, - "dp-attn": False + "dp-attn": False, }, "decode": { "num-worker": 1, "tp": 8, "ep": 8, - "dp-attn": False - } + "dp-attn": False, + }, } ] } ] } } - args = MockArgs(multi_node=True, step_size=2) - result = generate_full_sweep(args, config, sample_runner_config) + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) assert len(result) == 1 - # conc should be [4, 8, 16] - assert result[0]['conc'] == [4, 8, 16] - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) + # step_size=2: 1, 2, 4, 8 + assert result[0]["conc"] == [1, 2, 4, 8] diff --git a/utils/matrix-logic/test_validation.py b/utils/matrix-logic/test_validation.py index 6a9dc9683..003278feb 100644 --- a/utils/matrix-logic/test_validation.py +++ b/utils/matrix-logic/test_validation.py @@ -1,8 +1,6 @@ +"""Comprehensive tests for validation.py""" import pytest from validation import ( - validate_master_config, - validate_matrix_output, - validate_runner_config, Fields, SingleNodeMatrixEntry, MultiNodeMatrixEntry, @@ -13,1270 +11,728 @@ MultiNodeSeqLenConfig, SingleNodeMasterConfigEntry, MultiNodeMasterConfigEntry, + validate_matrix_entry, + validate_master_config, + validate_runner_config, ) -# ============================================================================ -# Tests for validate_master_config - Single Node -# ============================================================================ - -class TestValidateMasterConfigSingleNode: - """Tests for validate_master_config with single-node configurations.""" - - def test_valid_single_node_config(self): - """Test validation of a valid single-node config.""" - config = { - "test-fp8-h200-vllm": { - "image": "vllm/vllm-openai:v0.11.0", - "model": "meta-llama/Llama-3-70b", - "model-prefix": "llama70b", - "precision": "fp8", - "framework": "vllm", - 
"runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 64}, - {"tp": 8, "conc-start": 4, "conc-end": 64, "ep": 2, "dp-attn": True} - ] - } - ] - } - } - result = validate_master_config(config) - assert result == config - - def test_valid_single_node_with_disagg(self): - """Test validation with disagg field.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "disagg": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 64} - ] - } - ] - } - } - result = validate_master_config(config) - assert result == config - - def test_valid_single_node_with_conc_list(self): - """Test validation with conc-list instead of conc-start/end.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-list": [1, 2, 4, 8, 16]} - ] - } - ] - } - } - result = validate_master_config(config) - assert result == config - - def test_missing_required_field_image(self): - """Test validation fails when image is missing.""" - config = { - "test-config": { - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_required_field_model(self): - """Test validation fails when model is missing.""" - config = { - "test-config": { - 
"image": "test:latest", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_required_field_model_prefix(self): - """Test validation fails when model-prefix is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_required_field_precision(self): - """Test validation fails when precision is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_required_field_framework(self): - """Test validation fails when framework is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_required_field_runner(self): - """Test validation fails when runner 
is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_required_field_multinode(self): - """Test validation fails when multinode is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_required_field_seq_len_configs(self): - """Test validation fails when seq-len-configs is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_wrong_type_image(self): - """Test validation fails when image has wrong type.""" - config = { - "test-config": { - "image": 123, - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_empty_seq_len_configs(self): - """Test that empty seq-len-configs is allowed by validation. 
- - Note: Pydantic allows empty lists by default. This may produce - no output at runtime but is not a validation error. - """ - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [] - } - } - # This is allowed - Pydantic doesn't enforce non-empty lists by default - result = validate_master_config(config) - assert result == config - - def test_missing_isl_in_seq_len_config(self): - """Test validation fails when isl is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_osl_in_seq_len_config(self): - """Test validation fails when osl is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_search_space(self): - """Test validation fails when search-space is missing.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024 - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def 
test_empty_search_space(self): - """Test that empty search-space is allowed by validation. - - Note: Pydantic allows empty lists by default. This may produce - no output at runtime but is not a validation error. - """ - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [] - } - ] - } - } - # This is allowed - Pydantic doesn't enforce non-empty lists by default - result = validate_master_config(config) - assert result == config - - def test_missing_tp_in_search_space(self): - """Test validation fails when tp is missing in search-space.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_missing_both_conc_range_and_list(self): - """Test validation fails when neither conc-start/end nor conc-list is provided.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_both_conc_range_and_list_provided(self): - """Test validation fails when both conc-start/end and conc-list are provided.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": 
"vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 64, "conc-list": [1, 2, 4]} - ] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_conc_start_greater_than_end(self): - """Test validation fails when conc-start > conc-end.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 64, "conc-end": 4}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_conc_list_with_zero_value(self): - """Test validation fails when conc-list contains zero.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-list": [0, 1, 2]}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_wrong_type_tp(self): - """Test validation fails when tp has wrong type.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": "four", "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_wrong_type_ep(self): - """Test validation fails when ep has wrong type.""" - config = { - 
"test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "ep": "two", "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) - - def test_wrong_type_dp_attn(self): - """Test validation fails when dp-attn has a truly invalid type. - - Note: Pydantic coerces some string values to bools (e.g., "yes" -> True). - We test with a value that cannot be coerced. - """ - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "dp-attn": [1, 2, 3], "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) +# ============================================================================= +# Test Fixtures +# ============================================================================= + +@pytest.fixture +def valid_single_node_matrix_entry(): + """Valid single node matrix entry based on dsr1-fp4-mi355x-sglang config.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", + "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "precision": "fp4", + "framework": "sglang", + "spec-decoding": "none", + "runner": "mi355x", + "isl": 1024, + "osl": 1024, + "tp": 8, + "ep": 1, + "dp-attn": False, + "conc": 4, + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": False, + } + + +@pytest.fixture +def valid_multinode_matrix_entry(): + """Valid multinode matrix entry based on dsr1-fp4-gb200-dynamo-trt config.""" + return { + "image": 
"nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "precision": "fp4", + "framework": "dynamo-trt", + "spec-decoding": "none", + "runner": "gb200", + "isl": 1024, + "osl": 1024, + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + "DECODE_MTP_SIZE=0", + ], + }, + "conc": [2150], + "max-model-len": 2248, + "exp-name": "dsr1_1k1k", + "disagg": True, + } + + +@pytest.fixture +def valid_single_node_master_config(): + """Valid single node master config based on dsr1-fp8-mi300x-sglang.""" + return { + "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915", + "model": "deepseek-ai/DeepSeek-R1-0528", + "model-prefix": "dsr1", + "precision": "fp8", + "framework": "sglang", + "runner": "mi300x", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + } + ] + } + + +@pytest.fixture +def valid_multinode_master_config(): + """Valid multinode master config based on dsr1-fp4-gb200-dynamo-trt.""" + return { + "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", + "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "disagg": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + "additional-settings": [ + "PREFILL_MAX_NUM_TOKENS=8448", + "PREFILL_MAX_BATCH_SIZE=1", + ], + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + 
"DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + ], + }, + "conc-list": [2150], + } + ] + } + ] + } + + +@pytest.fixture +def valid_runner_config(): + """Valid runner config based on .github/configs/runners.yaml.""" + return { + "h100": ["h100-cr_0", "h100-cr_1", "h100-cw_0", "h100-cw_1"], + "h200": ["h200-cw_0", "h200-cw_1", "h200-nb_0", "h200-nb_1"], + "b200": ["b200-nvd_0", "b200-nvd_1", "b200-dgxc_1"], + "mi300x": ["mi300x-amd_0", "mi300x-amd_1", "mi300x-cr_0"], + "gb200": ["gb200-nv_0"], + } + + +# ============================================================================= +# Test Fields Enum +# ============================================================================= - def test_extra_field_in_top_level(self): - """Test validation fails when extra field is present at top level.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "extra-field": "not-allowed", - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [{"tp": 4, "conc-start": 4, "conc-end": 64}] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) +class TestFieldsEnum: + """Tests for Fields enum.""" - def test_extra_field_in_search_space(self): - """Test validation fails when extra field is present in search-space.""" - config = { - "test-config": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "multinode": False, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - {"tp": 4, "conc-start": 4, "conc-end": 64, "invalid-field": "value"} - ] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) + def test_field_values_are_strings(self): + """All field values should be strings.""" + for 
field in Fields: + assert isinstance(field.value, str) + def test_key_fields_exist(self): + """Key fields should be defined.""" + assert Fields.IMAGE.value == "image" + assert Fields.MODEL.value == "model" + assert Fields.TP.value == "tp" + assert Fields.MULTINODE.value == "multinode" + assert Fields.CONC.value == "conc" + assert Fields.SPEC_DECODING.value == "spec-decoding" + assert Fields.PREFILL.value == "prefill" + assert Fields.DECODE.value == "decode" -# ============================================================================ -# Tests for validate_master_config - Multi Node -# ============================================================================ -class TestValidateMasterConfigMultiNode: - """Tests for validate_master_config with multinode configurations.""" +# ============================================================================= +# Test WorkerConfig +# ============================================================================= - def test_valid_multinode_config(self): - """Test validation of a valid multinode config.""" - config = { - "test-multinode": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "multinode": True, - "disagg": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "spec-decoding": "mtp", - "conc-list": [1, 2, 4, 8], - "prefill": { - "num-worker": 1, - "tp": 4, - "ep": 4, - "dp-attn": False, - "additional-settings": ["PREFILL_MAX_NUM_TOKENS=4608"] - }, - "decode": { - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False, - "additional-settings": ["DECODE_MAX_NUM_TOKENS=128"] - } - } - ] - } - ] - } - } - result = validate_master_config(config) - assert result == config +class TestWorkerConfig: + """Tests for WorkerConfig model.""" - def test_valid_multinode_with_conc_range(self): - """Test validation of multinode config with conc-start/conc-end.""" - config = { - 
"test-multinode": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "multinode": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "conc-start": 4, - "conc-end": 64, - "prefill": { - "num-worker": 1, - "tp": 4, - "ep": 4, - "dp-attn": False - }, - "decode": { - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False - } - } - ] - } - ] - } - } - result = validate_master_config(config) - assert result == config + def test_valid_worker_config(self): + """Valid worker config should pass.""" + config = WorkerConfig(**{ + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }) + assert config.num_worker == 5 + assert config.tp == 4 + assert config.ep == 4 + assert config.dp_attn is True - def test_multinode_missing_prefill(self): - """Test validation fails when prefill is missing in multinode config.""" - config = { - "test-multinode": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "multinode": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "conc-list": [1, 2, 4], - "decode": { - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False - } - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) + def test_worker_config_with_additional_settings(self): + """Worker config with additional settings should pass.""" + config = WorkerConfig(**{ + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + "additional-settings": [ + "DECODE_MAX_NUM_TOKENS=256", + "DECODE_MAX_BATCH_SIZE=256", + "DECODE_GPU_MEM_FRACTION=0.8", + ], + }) + assert len(config.additional_settings) == 3 + assert "DECODE_MAX_NUM_TOKENS=256" in config.additional_settings + + def test_worker_config_missing_required_field(self): + """Missing required 
field should fail.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + # Missing ep and dp-attn + }) + + def test_worker_config_extra_field_forbidden(self): + """Extra fields should be forbidden.""" + with pytest.raises(Exception): + WorkerConfig(**{ + "num-worker": 2, + "tp": 4, + "ep": 1, + "dp-attn": False, + "unknown-field": "value", + }) + + +# ============================================================================= +# Test SingleNodeMatrixEntry +# ============================================================================= + +class TestSingleNodeMatrixEntry: + """Tests for SingleNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_single_node_matrix_entry): + """Valid entry should pass validation.""" + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.image == "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915" + assert entry.tp == 8 + assert entry.conc == 4 + assert entry.framework == "sglang" + + def test_conc_as_list(self, valid_single_node_matrix_entry): + """Conc can be a list of integers.""" + valid_single_node_matrix_entry["conc"] = [4, 8, 16, 32, 64] + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.conc == [4, 8, 16, 32, 64] + + def test_spec_decoding_values(self, valid_single_node_matrix_entry): + """Spec decoding should accept valid literal values.""" + for value in ["mtp", "draft_model", "none"]: + valid_single_node_matrix_entry["spec-decoding"] = value + entry = SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + assert entry.spec_decoding == value + + def test_invalid_spec_decoding(self, valid_single_node_matrix_entry): + """Invalid spec decoding value should fail.""" + valid_single_node_matrix_entry["spec-decoding"] = "invalid" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_missing_required_field(self, valid_single_node_matrix_entry): + """Missing required 
field should fail validation.""" + del valid_single_node_matrix_entry["model"] + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + def test_extra_field_forbidden(self, valid_single_node_matrix_entry): + """Extra fields should be forbidden.""" + valid_single_node_matrix_entry["extra-field"] = "value" + with pytest.raises(Exception): + SingleNodeMatrixEntry(**valid_single_node_matrix_entry) + + +# ============================================================================= +# Test MultiNodeMatrixEntry +# ============================================================================= + +class TestMultiNodeMatrixEntry: + """Tests for MultiNodeMatrixEntry model.""" + + def test_valid_entry(self, valid_multinode_matrix_entry): + """Valid entry should pass validation.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.model == "deepseek-r1-fp4" + assert entry.conc == [2150] + assert entry.disagg is True + + def test_prefill_decode_worker_configs(self, valid_multinode_matrix_entry): + """Prefill and decode should be WorkerConfig objects.""" + entry = MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + assert entry.prefill.num_worker == 5 + assert entry.prefill.tp == 4 + assert entry.decode.tp == 8 + assert entry.decode.dp_attn is True + + def test_conc_must_be_list(self, valid_multinode_matrix_entry): + """Conc must be a list for multinode.""" + valid_multinode_matrix_entry["conc"] = 2150 # Single int, not list + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_prefill(self, valid_multinode_matrix_entry): + """Missing prefill should fail.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + def test_missing_decode(self, valid_multinode_matrix_entry): + """Missing decode should fail.""" + del valid_multinode_matrix_entry["decode"] + with 
pytest.raises(Exception): + MultiNodeMatrixEntry(**valid_multinode_matrix_entry) + + +# ============================================================================= +# Test validate_matrix_entry function +# ============================================================================= + +class TestValidateMatrixEntry: + """Tests for validate_matrix_entry function.""" + + def test_valid_single_node(self, valid_single_node_matrix_entry): + """Valid single node entry should return the entry.""" + result = validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert result == valid_single_node_matrix_entry + + def test_valid_multinode(self, valid_multinode_matrix_entry): + """Valid multinode entry should return the entry.""" + result = validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert result == valid_multinode_matrix_entry + + def test_invalid_single_node_raises_valueerror(self, valid_single_node_matrix_entry): + """Invalid single node entry should raise ValueError.""" + del valid_single_node_matrix_entry["tp"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_single_node_matrix_entry, is_multinode=False) + assert "failed validation" in str(exc_info.value) + + def test_invalid_multinode_raises_valueerror(self, valid_multinode_matrix_entry): + """Invalid multinode entry should raise ValueError.""" + del valid_multinode_matrix_entry["prefill"] + with pytest.raises(ValueError) as exc_info: + validate_matrix_entry(valid_multinode_matrix_entry, is_multinode=True) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test SingleNodeSearchSpaceEntry +# ============================================================================= - def test_multinode_missing_decode(self): - """Test validation fails when decode is missing in multinode config.""" - config = { - "test-multinode": { - "image": "test:latest", - "model": 
"test/model", - "model-prefix": "test", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "multinode": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "conc-list": [1, 2, 4], - "prefill": { - "num-worker": 1, - "tp": 4, - "ep": 4, - "dp-attn": False - } - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) +class TestSingleNodeSearchSpaceEntry: + """Tests for SingleNodeSearchSpaceEntry model.""" - def test_multinode_invalid_spec_decoding(self): - """Test validation fails when spec-decoding has invalid value.""" - config = { - "test-multinode": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "multinode": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "spec-decoding": "invalid-value", - "conc-list": [1, 2, 4], - "prefill": { - "num-worker": 1, - "tp": 4, - "ep": 4, - "dp-attn": False - }, - "decode": { - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False - } - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) + def test_valid_with_conc_range(self): + """Valid entry with conc range should pass (like mi300x config).""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 4, + "conc-end": 64, + }) + assert entry.tp == 8 + assert entry.conc_start == 4 + assert entry.conc_end == 64 - def test_multinode_worker_config_missing_num_worker(self): - """Test validation fails when num-worker is missing in worker config.""" - config = { - "test-multinode": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "multinode": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "conc-list": [1, 
2, 4], - "prefill": { - "tp": 4, - "ep": 4, - "dp-attn": False - }, - "decode": { - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False - } - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) + def test_valid_with_conc_list(self): + """Valid entry with conc list should pass.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 8, 16, 32, 64, 128], + }) + assert entry.conc_list == [4, 8, 16, 32, 64, 128] - def test_multinode_worker_config_missing_tp(self): - """Test validation fails when tp is missing in worker config.""" - config = { - "test-multinode": { - "image": "test:latest", - "model": "test/model", - "model-prefix": "test", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "multinode": True, - "seq-len-configs": [ - { - "isl": 1024, - "osl": 1024, - "search-space": [ - { - "conc-list": [1, 2, 4], - "prefill": { - "num-worker": 1, - "ep": 4, - "dp-attn": False - }, - "decode": { - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False - } - } - ] - } - ] - } - } - with pytest.raises(ValueError, match="failed validation"): - validate_master_config(config) + def test_cannot_have_both_range_and_list(self): + """Cannot specify both conc range and list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-start": 4, + "conc-end": 64, + "conc-list": [4, 8, 16], + }) + assert "Cannot specify both" in str(exc_info.value) + + def test_must_have_range_or_list(self): + """Must specify either conc range or list.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + }) + assert "Must specify either" in str(exc_info.value) + def test_conc_start_must_be_lte_conc_end(self): + """conc-start must be <= conc-end.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-start": 64, + "conc-end": 4, + }) + assert "must be <=" in 
str(exc_info.value) + + def test_conc_list_values_must_be_positive(self): + """conc-list values must be > 0.""" + with pytest.raises(Exception) as exc_info: + SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "conc-list": [4, 0, 16], + }) + assert "must be greater than 0" in str(exc_info.value) -# ============================================================================ -# Tests for validate_matrix_output - Single Node -# ============================================================================ + def test_optional_fields_defaults(self): + """Optional fields should have correct defaults.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 8, + "conc-list": [4, 8], + }) + assert entry.ep is None + assert entry.dp_attn is None + assert entry.spec_decoding == "none" -class TestValidateMatrixOutputSingleNode: - """Tests for validate_matrix_output with single-node entries.""" + def test_with_ep_and_dp_attn(self): + """Entry with ep and dp-attn like b200-sglang config.""" + entry = SingleNodeSearchSpaceEntry(**{ + "tp": 4, + "ep": 4, + "dp-attn": True, + "conc-start": 4, + "conc-end": 128, + }) + assert entry.ep == 4 + assert entry.dp_attn is True - def test_valid_single_node_entry(self): - """Test validation of a valid single-node matrix entry.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2248, - "exp-name": "test_1k1k", - "disagg": False - } - result = validate_matrix_output(entry, is_multinode=False) - assert result == entry - - def test_single_node_missing_field(self): - """Test validation fails when required field is missing.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, - # Missing tp - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2248, - "exp-name": 
"test_1k1k", - "disagg": False - } - with pytest.raises(ValueError, match="failed validation"): - validate_matrix_output(entry, is_multinode=False) - - def test_single_node_wrong_type(self): - """Test validation fails when field has wrong type.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": "not-an-int", - "osl": 1024, - "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2248, - "exp-name": "test_1k1k", - "disagg": False - } - with pytest.raises(ValueError, match="failed validation"): - validate_matrix_output(entry, is_multinode=False) - - def test_single_node_extra_field(self): - """Test validation fails when extra field is present.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp8", - "framework": "vllm", - "runner": "h200", - "isl": 1024, - "osl": 1024, + def test_with_spec_decoding_mtp(self): + """Entry with mtp spec decoding.""" + entry = SingleNodeSearchSpaceEntry(**{ "tp": 8, - "ep": 1, - "dp-attn": False, - "conc": 4, - "max-model-len": 2248, - "exp-name": "test_1k1k", - "disagg": False, - "extra-field": "not-allowed" - } - with pytest.raises(ValueError, match="failed validation"): - validate_matrix_output(entry, is_multinode=False) + "spec-decoding": "mtp", + "conc-list": [1, 2, 4], + }) + assert entry.spec_decoding == "mtp" -# ============================================================================ -# Tests for validate_matrix_output - Multi Node -# ============================================================================ +# ============================================================================= +# Test MultiNodeSearchSpaceEntry +# ============================================================================= -class TestValidateMatrixOutputMultiNode: - """Tests for validate_matrix_output with multinode entries.""" +class TestMultiNodeSearchSpaceEntry: + """Tests for MultiNodeSearchSpaceEntry 
model.""" - def test_valid_multinode_entry(self): - """Test validation of a valid multinode matrix entry.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp4", - "framework": "dynamo-trt", - "spec-decoding": "mtp", - "runner": "gb200", - "isl": 1024, - "osl": 1024, + def test_valid_with_conc_list(self): + """Valid multinode search space with list (like gb200 config).""" + entry = MultiNodeSearchSpaceEntry(**{ "prefill": { - "num-worker": 1, + "num-worker": 5, "tp": 4, "ep": 4, - "dp-attn": False + "dp-attn": True, + "additional-settings": ["PREFILL_MAX_NUM_TOKENS=8448"], }, "decode": { - "num-worker": 4, + "num-worker": 1, "tp": 8, "ep": 8, - "dp-attn": False + "dp-attn": True, + "additional-settings": ["DECODE_MAX_NUM_TOKENS=256"], }, - "conc": [1, 2, 4, 8], - "max-model-len": 2248, - "exp-name": "test_1k1k", - "disagg": True - } - result = validate_matrix_output(entry, is_multinode=True) - assert result == entry - - def test_multinode_missing_spec_decoding(self): - """Test validation fails when spec-decoding is missing.""" - entry = { - "image": "test:latest", - "model": "test/model", - "precision": "fp4", - "framework": "dynamo-trt", - "runner": "gb200", - "isl": 1024, - "osl": 1024, + "conc-list": [2150], + }) + assert entry.prefill.num_worker == 5 + assert entry.decode.tp == 8 + + def test_valid_with_conc_range(self): + """Valid multinode search space with range.""" + entry = MultiNodeSearchSpaceEntry(**{ "prefill": { "num-worker": 1, "tp": 4, "ep": 4, - "dp-attn": False + "dp-attn": False, }, "decode": { "num-worker": 4, "tp": 8, "ep": 8, - "dp-attn": False + "dp-attn": False, }, - "conc": [1, 2, 4, 8], - "max-model-len": 2248, - "exp-name": "test_1k1k", - "disagg": True - } - with pytest.raises(ValueError, match="failed validation"): - validate_matrix_output(entry, is_multinode=True) - - def test_multinode_conc_not_list(self): - """Test validation fails when conc is not a list in multinode entry.""" - entry = { - "image": 
"test:latest", - "model": "test/model", - "precision": "fp4", - "framework": "dynamo-trt", + "conc-start": 1, + "conc-end": 64, + }) + assert entry.conc_start == 1 + assert entry.conc_end == 64 + + def test_with_spec_decoding_mtp(self): + """Multinode entry with mtp spec decoding.""" + entry = MultiNodeSearchSpaceEntry(**{ "spec-decoding": "mtp", - "runner": "gb200", - "isl": 1024, - "osl": 1024, "prefill": { "num-worker": 1, "tp": 4, "ep": 4, - "dp-attn": False + "dp-attn": False, }, "decode": { "num-worker": 4, "tp": 8, "ep": 8, - "dp-attn": False + "dp-attn": False, }, - "conc": 4, # Should be a list - "max-model-len": 2248, - "exp-name": "test_1k1k", - "disagg": True - } - with pytest.raises(ValueError, match="failed validation"): - validate_matrix_output(entry, is_multinode=True) + "conc-list": [1, 2, 4, 8, 16, 36], + }) + assert entry.spec_decoding == "mtp" + def test_missing_conc_specification(self): + """Missing conc specification should fail.""" + with pytest.raises(Exception): + MultiNodeSearchSpaceEntry(**{ + "prefill": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 2, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + # Missing conc specification + }) + + +# ============================================================================= +# Test SeqLenConfig models +# ============================================================================= + +class TestSeqLenConfigs: + """Tests for sequence length config models.""" + + def test_single_node_seq_len_config_1k1k(self): + """Valid single node seq len config for 1k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 1024 + assert config.osl == 1024 + assert len(config.search_space) == 1 + + def test_single_node_seq_len_config_8k1k(self): + """Valid single node seq len config for 8k/1k.""" + config = SingleNodeSeqLenConfig(**{ + "isl": 8192, 
+ "osl": 1024, + "search-space": [ + {"tp": 8, "conc-start": 4, "conc-end": 64} + ] + }) + assert config.isl == 8192 + assert config.osl == 1024 -# ============================================================================ -# Tests for validate_runner_config -# ============================================================================ + def test_multinode_seq_len_config(self): + """Valid multinode seq len config.""" + config = MultiNodeSeqLenConfig(**{ + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "prefill": { + "num-worker": 5, + "tp": 4, + "ep": 4, + "dp-attn": True, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": True, + }, + "conc-list": [2150], + } + ] + }) + assert config.isl == 1024 + assert config.osl == 1024 + + +# ============================================================================= +# Test MasterConfigEntry models +# ============================================================================= + +class TestMasterConfigEntries: + """Tests for master config entry models.""" + + def test_single_node_master_config(self, valid_single_node_master_config): + """Valid single node master config.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.multinode is False + assert config.model_prefix == "dsr1" + assert config.runner == "mi300x" + assert config.framework == "sglang" + + def test_multinode_master_config(self, valid_multinode_master_config): + """Valid multinode master config.""" + config = MultiNodeMasterConfigEntry(**valid_multinode_master_config) + assert config.multinode is True + assert config.model_prefix == "dsr1" + assert config.runner == "gb200" + assert config.disagg is True + + def test_single_node_cannot_have_multinode_true(self, valid_single_node_master_config): + """Single node config must have multinode=False.""" + valid_single_node_master_config["multinode"] = True + with pytest.raises(Exception): + 
SingleNodeMasterConfigEntry(**valid_single_node_master_config) + + def test_multinode_cannot_have_multinode_false(self, valid_multinode_master_config): + """Multinode config must have multinode=True.""" + valid_multinode_master_config["multinode"] = False + with pytest.raises(Exception): + MultiNodeMasterConfigEntry(**valid_multinode_master_config) + + def test_disagg_default_false(self, valid_single_node_master_config): + """Disagg should default to False.""" + config = SingleNodeMasterConfigEntry(**valid_single_node_master_config) + assert config.disagg is False + + +# ============================================================================= +# Test validate_master_config function +# ============================================================================= + +class TestValidateMasterConfig: + """Tests for validate_master_config function.""" + + def test_valid_single_node_config(self, valid_single_node_master_config): + """Valid single node config should pass.""" + configs = {"dsr1-fp8-mi300x-sglang": valid_single_node_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_valid_multinode_config(self, valid_multinode_master_config): + """Valid multinode config should pass.""" + configs = {"dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config} + result = validate_master_config(configs) + assert result == configs + + def test_mixed_configs(self, valid_single_node_master_config, valid_multinode_master_config): + """Mixed single and multinode configs should pass.""" + configs = { + "dsr1-fp8-mi300x-sglang": valid_single_node_master_config, + "dsr1-fp4-gb200-dynamo-trt": valid_multinode_master_config, + } + result = validate_master_config(configs) + assert len(result) == 2 + + def test_invalid_config_raises_valueerror(self, valid_single_node_master_config): + """Invalid config should raise ValueError with key name.""" + del valid_single_node_master_config["model"] + configs = {"broken-config": 
valid_single_node_master_config} + with pytest.raises(ValueError) as exc_info: + validate_master_config(configs) + assert "broken-config" in str(exc_info.value) + assert "failed validation" in str(exc_info.value) + + +# ============================================================================= +# Test validate_runner_config function +# ============================================================================= class TestValidateRunnerConfig: """Tests for validate_runner_config function.""" - def test_valid_runner_config(self): - """Test validation of a valid runner config.""" - config = { - "h200": ["h200-nv_1", "h200-nv_2"], - "b200": ["b200-nv_1"], - "gb200": ["gb200-nv_1", "gb200-nv_2", "gb200-nv_3"] - } - result = validate_runner_config(config) - assert result == config + def test_valid_runner_config(self, valid_runner_config): + """Valid runner config should pass.""" + result = validate_runner_config(valid_runner_config) + assert result == valid_runner_config - def test_runner_config_value_not_list(self): - """Test validation fails when runner value is not a list.""" + def test_value_must_be_list(self): + """Runner config values must be lists.""" config = { - "h200": "h200-nv_1" # Should be a list + "h100": "h100-cr_0", # Not a list } - with pytest.raises(ValueError, match="must be a list"): + with pytest.raises(ValueError) as exc_info: validate_runner_config(config) + assert "must be a list" in str(exc_info.value) - def test_runner_config_list_not_strings(self): - """Test validation fails when list contains non-strings.""" + def test_list_must_contain_strings(self): + """Runner config lists must contain only strings.""" config = { - "h200": ["h200-nv_1", 123] # Contains non-string + "h100": ["h100-cr_0", 123], # Contains non-string } - with pytest.raises(ValueError, match="must contain only strings"): + with pytest.raises(ValueError) as exc_info: validate_runner_config(config) + assert "must contain only strings" in str(exc_info.value) - def 
test_runner_config_empty_list(self): - """Test validation fails when runner list is empty.""" + def test_list_cannot_be_empty(self): + """Runner config lists cannot be empty.""" config = { - "h200": [] # Empty list + "mi355x": [], } - with pytest.raises(ValueError, match="cannot be an empty list"): + with pytest.raises(ValueError) as exc_info: validate_runner_config(config) - - -# ============================================================================ -# Tests for Pydantic Models - Unit Tests -# ============================================================================ - -class TestWorkerConfigModel: - """Tests for WorkerConfig Pydantic model.""" - - def test_valid_worker_config(self): - """Test valid WorkerConfig.""" - config = WorkerConfig(**{ - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False, - "additional-settings": ["SETTING1=value1"] - }) - assert config.num_worker == 4 - assert config.tp == 8 - - def test_worker_config_without_additional_settings(self): - """Test WorkerConfig without additional-settings.""" - config = WorkerConfig(**{ - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False - }) - assert config.additional_settings is None - - -class TestSingleNodeSearchSpaceEntry: - """Tests for SingleNodeSearchSpaceEntry Pydantic model.""" - - def test_valid_with_range(self): - """Test valid entry with conc-start/conc-end.""" - entry = SingleNodeSearchSpaceEntry(**{ - "tp": 4, - "conc-start": 4, - "conc-end": 64 - }) - assert entry.tp == 4 - assert entry.conc_start == 4 - assert entry.conc_end == 64 - - def test_valid_with_list(self): - """Test valid entry with conc-list.""" - entry = SingleNodeSearchSpaceEntry(**{ - "tp": 4, - "conc-list": [1, 2, 4, 8] - }) - assert entry.tp == 4 - assert entry.conc_list == [1, 2, 4, 8] - - def test_valid_with_optional_fields(self): - """Test valid entry with optional fields.""" - entry = SingleNodeSearchSpaceEntry(**{ - "tp": 4, - "ep": 2, - "dp-attn": True, - "spec-decoding": "mtp", - "conc-start": 4, 
- "conc-end": 64 - }) - assert entry.ep == 2 - assert entry.dp_attn == True - assert entry.spec_decoding == "mtp" - - -class TestMultiNodeSearchSpaceEntry: - """Tests for MultiNodeSearchSpaceEntry Pydantic model.""" - - def test_valid_multinode_entry(self): - """Test valid multinode search-space entry.""" - entry = MultiNodeSearchSpaceEntry(**{ - "conc-list": [1, 2, 4], - "prefill": { - "num-worker": 1, - "tp": 4, - "ep": 4, - "dp-attn": False - }, - "decode": { - "num-worker": 4, - "tp": 8, - "ep": 8, - "dp-attn": False - } - }) - assert entry.prefill.num_worker == 1 - assert entry.decode.num_worker == 4 - - -# ============================================================================ -# Tests for Fields Enum -# ============================================================================ - -class TestFieldsEnum: - """Tests for the Fields enum.""" - - def test_field_values(self): - """Test that Fields enum has expected values.""" - assert Fields.IMAGE.value == 'image' - assert Fields.MODEL.value == 'model' - assert Fields.MODEL_PREFIX.value == 'model-prefix' - assert Fields.PRECISION.value == 'precision' - assert Fields.FRAMEWORK.value == 'framework' - assert Fields.RUNNER.value == 'runner' - assert Fields.SEQ_LEN_CONFIGS.value == 'seq-len-configs' - assert Fields.MULTINODE.value == 'multinode' - assert Fields.ISL.value == 'isl' - assert Fields.OSL.value == 'osl' - assert Fields.SEARCH_SPACE.value == 'search-space' - assert Fields.TP.value == 'tp' - assert Fields.EP.value == 'ep' - assert Fields.CONC_START.value == 'conc-start' - assert Fields.CONC_END.value == 'conc-end' - assert Fields.CONC_LIST.value == 'conc-list' - assert Fields.DP_ATTN.value == 'dp-attn' - assert Fields.DISAGG.value == 'disagg' - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) + assert "cannot be an empty list" in str(exc_info.value) + + def test_multiple_runner_types(self, valid_runner_config): + """Multiple runner types should work.""" + result = 
validate_runner_config(valid_runner_config) + assert "h100" in result + assert "h200" in result + assert "mi300x" in result + assert "gb200" in result diff --git a/utils/matrix-logic/validation.py b/utils/matrix-logic/validation.py index 25f83cc38..496d5003f 100644 --- a/utils/matrix-logic/validation.py +++ b/utils/matrix-logic/validation.py @@ -4,6 +4,11 @@ import pprint +""" + The below class defines the field names expected to be present in the JSON entries + for both single-node and multi-node configurations. +""" + class Fields(Enum): # Field name constants @@ -46,6 +51,16 @@ class Fields(Enum): DISAGG = 'disagg' +""" + Below is the validation logic for the OUTPUT of utils/matrix-logic/generate_sweep_configs.py, i.e., + the input to the actual workflow files. The validation enforces a strict set of rules on the structure + of the generated matrix entries to ensure correctness before proceeding with benchmarking. This ensures + that no validation has to happen in the workflow itself, i.e., at runtime, it is assumed that all inputs + are valid. Therefore, there should not be any default values set in these Pydantic models. Any missing value + should raise a validation error. +""" + + class SingleNodeMatrixEntry(BaseModel): """Pydantic model for validating single node matrix entry structure. This validates the input that should be expected to .github/workflows/benchmark-tmpl.yml""" @@ -79,7 +94,7 @@ class WorkerConfig(BaseModel): ep: int dp_attn: bool = Field(alias=Fields.DP_ATTN.value) additional_settings: Optional[List[str]] = Field( - default=None, alias=Fields.ADDITIONAL_SETTINGS.value) + default=[], alias=Fields.ADDITIONAL_SETTINGS.value) class MultiNodeMatrixEntry(BaseModel): @@ -105,7 +120,7 @@ class MultiNodeMatrixEntry(BaseModel): disagg: bool -def validate_matrix_output(entry: dict, is_multinode: bool) -> dict: +def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: """Validate that matrix_values entries match the expected structure. 
Raises ValueError if any entry fails validation. @@ -121,7 +136,13 @@ def validate_matrix_output(entry: dict, is_multinode: bool) -> dict: f"The following parsed matrix entry failed validation:\n{pprint.pformat(entry)}\n{e}") return entry -# Input Master Config Validation + +""" + Below is the validation logic for the INPUT to utils/matrix-logic/generate_sweep_configs.py, i.e., + the master configuration files found in .github/configs. The validation enforces a strict set of + rules on the structure of the master configuration files to ensure correctness before proceeding + with matrix generation. +""" def _validate_conc_fields(self): diff --git a/utils/scrape_image_tag.py b/utils/scrape_image_tag.py deleted file mode 100644 index f45b5f9fd..000000000 --- a/utils/scrape_image_tag.py +++ /dev/null @@ -1,43 +0,0 @@ -import sys -import requests - -repository = sys.argv[1] -auth_url = f'https://auth.docker.io/token?service=registry.docker.io&scope=repository:{repository}:pull' -token = requests.get(auth_url).json()['token'] - -tags_url = f'https://registry-1.docker.io/v2/{repository}/tags/list' -resp = requests.get(tags_url, headers={'Authorization': f'Bearer {token}'}) -resp.raise_for_status() - -vllm_tags = resp.json()['tags'] -valid_tags = [tag for tag in vllm_tags if tag.startswith(sys.argv[2]) and 'rc' not in tag] - - -def make_key_cuda(tag): - ''' - Tag format: vX.Y.Z(.W) - X, Y, Z are numbers - W can be a number or string postN (N is a number) - ''' - vals = tag[1:].split('.') - post = vals[3] if len(vals) == 4 else '0' - key = (int(vals[0]), int(vals[1]), int(vals[2]), post) - return key - -def make_key_rocm(tag): - *_, date = tag.split('_') - try: - key = int(date) - except: - key = -1 - return key - -if repository == 'vllm/vllm-openai': - make_key_fn = make_key_cuda -elif repository == 'rocm/vllm': - make_key_fn = make_key_rocm -else: - raise ValueError(f'Invalid repo {repository}') - -tag = max(valid_tags, key=make_key_fn) -print(f'{repository}:{tag}') diff 
--git a/utils/summarize.py b/utils/summarize.py index 75cf6610b..30c54e18f 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -13,8 +13,10 @@ single_node_results = [r for r in results if not r['is_multinode']] multinode_results = [r for r in results if r['is_multinode']] +# Single-node and multi-node results have different fields and therefore need to be printed separately if single_node_results: - single_node_results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) + single_node_results.sort(key=lambda r: ( + r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) print("## Single-Node Results\n") single_node_header = '''\ @@ -43,11 +45,12 @@ f"| {result['output_tput_per_gpu']:.4f} " f"| {result['input_tput_per_gpu']:.4f} |" ) - + print("\n") if multinode_results: - multinode_results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) + multinode_results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], + r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) print("## Multi-Node Results\n") multinode_header = '''\ diff --git a/utils/test_process_result.py b/utils/test_process_result.py deleted file mode 100644 index bb1765acf..000000000 --- a/utils/test_process_result.py +++ /dev/null @@ -1,150 +0,0 @@ -import pytest -import json -import sys -import importlib.util -from pathlib import Path -from io import StringIO - - -def create_mock_result_file(tmp_path): - """Create a mock result JSON file.""" - result_data = { - "max_concurrency": 10, - "model_id": "test-model", - "total_token_throughput": 1000.0, - "output_throughput": 400.0, - "ttft_ms": 50.0, - "tpot_ms": 20.0 - } - result_file = tmp_path / "test_result.json" - with open(result_file, 'w') as f: - 
json.dump(result_data, f) - return result_file - - -def run_process_result_script(tmp_path): - """Helper to run process_result.py and return the output data.""" - # Create mock result file - create_mock_result_file(tmp_path) - - # Get script path relative to this test file - script_path = Path(__file__).parent / "process_result.py" - spec = importlib.util.spec_from_file_location("process_result", script_path) - module = importlib.util.module_from_spec(spec) - - # Capture stdout - old_stdout = sys.stdout - sys.stdout = StringIO() - - try: - spec.loader.exec_module(module) - output = sys.stdout.getvalue() - return json.loads(output) - finally: - sys.stdout = old_stdout - - -def test_disagg_true_when_both_env_vars_set(tmp_path, monkeypatch): - """Test that disagg=true when both PREFILL_GPUS and DECODE_GPUS are set.""" - # Set environment variables - monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('PREFILL_GPUS', '4') - monkeypatch.setenv('DECODE_GPUS', '4') - monkeypatch.setenv('DP_ATTENTION', 'false') - monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is true - assert data['disagg'] is True - # Check that num_prefill_gpu and num_decode_gpu are present - assert data['num_prefill_gpu'] == 4 - assert data['num_decode_gpu'] == 4 - - -def test_disagg_false_when_prefill_gpus_not_set(tmp_path, monkeypatch): - """Test that disagg=false when PREFILL_GPUS is not set.""" - # Set environment variables (without PREFILL_GPUS) - monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('DECODE_GPUS', '4') - monkeypatch.setenv('DP_ATTENTION', 'false') - 
monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is false - assert data['disagg'] is False - # Check that num_prefill_gpu and num_decode_gpu are NOT present - assert 'num_prefill_gpu' not in data - assert 'num_decode_gpu' not in data - - -def test_disagg_false_when_decode_gpus_not_set(tmp_path, monkeypatch): - """Test that disagg=false when DECODE_GPUS is not set.""" - # Set environment variables (without DECODE_GPUS) - monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('PREFILL_GPUS', '4') - monkeypatch.setenv('DP_ATTENTION', 'false') - monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is false - assert data['disagg'] is False - # Check that num_prefill_gpu and num_decode_gpu are NOT present - assert 'num_prefill_gpu' not in data - assert 'num_decode_gpu' not in data - - -def test_disagg_false_when_both_env_vars_empty_strings(tmp_path, monkeypatch): - """Test that disagg=false when both PREFILL_GPUS and DECODE_GPUS are empty strings.""" - # Set environment variables with empty strings - monkeypatch.setenv('RUNNER_TYPE', 'h200') - monkeypatch.setenv('TP', '8') - monkeypatch.setenv('EP_SIZE', '1') - monkeypatch.setenv('PREFILL_GPUS', '') - monkeypatch.setenv('DECODE_GPUS', '') - monkeypatch.setenv('DP_ATTENTION', 'false') - monkeypatch.setenv('RESULT_FILENAME', 'test_result') - monkeypatch.setenv('FRAMEWORK', 'vllm') - monkeypatch.setenv('PRECISION', 'fp8') - - # Change to tmp_path 
directory - monkeypatch.chdir(tmp_path) - - # Run the script and get output - data = run_process_result_script(tmp_path) - - # Check that disagg is false - assert data['disagg'] is False - # Check that num_prefill_gpu and num_decode_gpu are NOT present - assert 'num_prefill_gpu' not in data - assert 'num_decode_gpu' not in data - - -if __name__ == "__main__": - pytest.main([__file__, "-v"]) From 2379449faefbe1f7fffad82c7f066eb4d985f432 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 09:46:42 -0600 Subject: [PATCH 66/98] adding tests --- .../test_generate_sweep_configs.py | 2 - utils/test_process_result.py | 488 ++++++++++++++++++ 2 files changed, 488 insertions(+), 2 deletions(-) create mode 100644 utils/test_process_result.py diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index ba45f2ab5..292ad3189 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -1,8 +1,6 @@ """Comprehensive tests for generate_sweep_configs.py""" import pytest -import json import argparse -from unittest.mock import patch, mock_open from generate_sweep_configs import ( seq_len_stoi, seq_len_itos, diff --git a/utils/test_process_result.py b/utils/test_process_result.py new file mode 100644 index 000000000..e223e256a --- /dev/null +++ b/utils/test_process_result.py @@ -0,0 +1,488 @@ +"""Comprehensive tests for process_result.py + +Since process_result.py executes code at module import time, we test it by: +1. Testing the get_required_env_vars function directly +2. 
Running the script as a subprocess with mocked environment and files +""" +import pytest +import json +import subprocess +import sys +from pathlib import Path + +SCRIPT_PATH = Path(__file__).parent / "process_result.py" + + +# ============================================================================= +# Test Fixtures - Based on real benchmark output structure +# ============================================================================= + +@pytest.fixture +def sample_benchmark_result(): + """Sample benchmark result JSON based on real output structure.""" + return { + "model_id": "deepseek-ai/DeepSeek-R1-0528", + "max_concurrency": 64, + "total_token_throughput": 15000.5, + "output_throughput": 12000.0, + "ttft_p50_ms": 150.5, + "ttft_p99_ms": 250.3, + "tpot_p50_ms": 25.0, + "tpot_p99_ms": 45.0, + "e2e_latency_p50_ms": 1500.0, + "e2e_latency_p99_ms": 2500.0, + } + + +@pytest.fixture +def base_env_vars(): + """Base environment variables for single-node setup.""" + return { + "RUNNER_TYPE": "mi300x", + "FRAMEWORK": "sglang", + "PRECISION": "fp8", + "SPEC_DECODING": "none", + "RESULT_FILENAME": "benchmark_result", + "ISL": "1024", + "OSL": "1024", + "DISAGG": "false", + } + + +@pytest.fixture +def single_node_env_vars(base_env_vars): + """Environment variables for single-node setup.""" + return { + **base_env_vars, + "TP": "8", + "EP_SIZE": "1", + "DP_ATTENTION": "false", + } + + +@pytest.fixture +def multinode_env_vars(base_env_vars): + """Environment variables for multinode setup based on gb200 config.""" + return { + **base_env_vars, + "RUNNER_TYPE": "gb200", + "FRAMEWORK": "dynamo-trt", + "PRECISION": "fp4", + "DISAGG": "true", + "IS_MULTINODE": "true", + "PREFILL_GPUS": "20", + "DECODE_GPUS": "8", + "PREFILL_NUM_WORKERS": "5", + "PREFILL_TP": "4", + "PREFILL_EP": "4", + "PREFILL_DP_ATTN": "true", + "DECODE_NUM_WORKERS": "1", + "DECODE_TP": "8", + "DECODE_EP": "8", + "DECODE_DP_ATTN": "true", + } + + +def run_script(tmp_path, env, benchmark_result, 
result_filename="benchmark_result"): + """Helper to run the process_result.py script.""" + result_file = tmp_path / f"{result_filename}.json" + result_file.write_text(json.dumps(benchmark_result)) + + env = env.copy() + env["RESULT_FILENAME"] = result_filename + + return subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env=env, + capture_output=True, + text=True, + ) + + +# ============================================================================= +# Test get_required_env_vars function +# ============================================================================= + +class TestGetRequiredEnvVars: + """Tests for get_required_env_vars function.""" + + def test_all_vars_present(self, monkeypatch): + """Should return dict when all vars present.""" + monkeypatch.setenv("TEST_VAR_1", "value1") + monkeypatch.setenv("TEST_VAR_2", "value2") + + import os + + def get_required_env_vars(required_vars): + env_values = {} + missing_env_vars = [] + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}") + return env_values + + result = get_required_env_vars(["TEST_VAR_1", "TEST_VAR_2"]) + assert result["TEST_VAR_1"] == "value1" + assert result["TEST_VAR_2"] == "value2" + + def test_missing_vars_raises_error(self, monkeypatch): + """Should raise EnvironmentError when vars missing.""" + import os + + def get_required_env_vars(required_vars): + env_values = {} + missing_env_vars = [] + for var_name in required_vars: + value = os.environ.get(var_name) + if value is None: + missing_env_vars.append(var_name) + env_values[var_name] = value + if missing_env_vars: + raise EnvironmentError( + f"Missing required environment variables: {', '.join(missing_env_vars)}") + return env_values + + monkeypatch.delenv("NONEXISTENT_VAR", raising=False) + + 
with pytest.raises(EnvironmentError) as exc_info: + get_required_env_vars(["NONEXISTENT_VAR"]) + assert "NONEXISTENT_VAR" in str(exc_info.value) + + +# ============================================================================= +# Test script execution via subprocess +# ============================================================================= + +class TestProcessResultScript: + """Tests for process_result.py script execution.""" + + def test_single_node_processing(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """Test single-node result processing.""" + result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Verify base fields + assert output_data["hw"] == "mi300x" + assert output_data["framework"] == "sglang" + assert output_data["precision"] == "fp8" + assert output_data["spec_decoding"] == "none" + assert output_data["model"] == "deepseek-ai/DeepSeek-R1-0528" + assert output_data["conc"] == 64 + assert output_data["isl"] == 1024 + assert output_data["osl"] == 1024 + assert output_data["disagg"] is False + + # Verify single-node specific fields + assert output_data["is_multinode"] is False + assert output_data["tp"] == 8 + assert output_data["ep"] == 1 + assert output_data["dp_attention"] == "false" + + # Verify throughput calculations (divided by tp=8) + assert output_data["tput_per_gpu"] == pytest.approx(15000.5 / 8) + assert output_data["output_tput_per_gpu"] == pytest.approx(12000.0 / 8) + assert output_data["input_tput_per_gpu"] == pytest.approx((15000.5 - 12000.0) / 8) + + # Verify latency conversions (ms to seconds) + assert output_data["ttft_p50"] == pytest.approx(0.1505) + assert output_data["ttft_p99"] == pytest.approx(0.2503) + assert output_data["e2e_latency_p50"] == pytest.approx(1.5) + assert output_data["e2e_latency_p99"] == pytest.approx(2.5) + + # Verify interactivity calculations (1000 
/ tpot_ms) + assert output_data["intvty_p50"] == pytest.approx(1000.0 / 25.0) + assert output_data["intvty_p99"] == pytest.approx(1000.0 / 45.0) + + # Verify output file created + output_file = tmp_path / "agg_benchmark_result.json" + assert output_file.exists() + + def test_multinode_processing(self, tmp_path, sample_benchmark_result, multinode_env_vars): + """Test multinode result processing.""" + result = run_script(tmp_path, multinode_env_vars, sample_benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + + # Verify base fields + assert output_data["hw"] == "gb200" + assert output_data["framework"] == "dynamo-trt" + assert output_data["precision"] == "fp4" + assert output_data["disagg"] is True + + # Verify multinode specific fields + assert output_data["is_multinode"] is True + assert output_data["prefill_tp"] == 4 + assert output_data["prefill_ep"] == 4 + assert output_data["prefill_dp_attention"] == "true" + assert output_data["prefill_num_workers"] == 5 + assert output_data["decode_tp"] == 8 + assert output_data["decode_ep"] == 8 + assert output_data["decode_dp_attention"] == "true" + assert output_data["decode_num_workers"] == 1 + assert output_data["num_prefill_gpu"] == 20 + assert output_data["num_decode_gpu"] == 8 + + # Verify throughput calculations + total_gpus = 20 + 8 # prefill + decode + assert output_data["tput_per_gpu"] == pytest.approx(15000.5 / total_gpus) + assert output_data["output_tput_per_gpu"] == pytest.approx(12000.0 / 8) # decode gpus + assert output_data["input_tput_per_gpu"] == pytest.approx((15000.5 - 12000.0) / 20) # prefill gpus + + def test_missing_base_env_vars(self, tmp_path, sample_benchmark_result): + """Test that missing base env vars causes failure.""" + result_file = tmp_path / "benchmark_result.json" + result_file.write_text(json.dumps(sample_benchmark_result)) + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + 
env={"PATH": "/usr/bin", "RESULT_FILENAME": "benchmark_result"}, + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + def test_missing_single_node_env_vars(self, tmp_path, sample_benchmark_result, base_env_vars): + """Test that missing single-node env vars causes failure.""" + # base_env_vars doesn't have TP, EP_SIZE, DP_ATTENTION + result = run_script(tmp_path, base_env_vars, sample_benchmark_result) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + def test_missing_multinode_env_vars(self, tmp_path, sample_benchmark_result, base_env_vars): + """Test that missing multinode env vars causes failure.""" + env = base_env_vars.copy() + env["IS_MULTINODE"] = "true" + env["DISAGG"] = "true" + # Missing multinode-specific vars + + result = run_script(tmp_path, env, sample_benchmark_result) + + assert result.returncode != 0 + assert "Missing required environment variables" in result.stderr + + def test_disagg_without_multinode_fails(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """Test that disagg=true without multinode raises error.""" + env = single_node_env_vars.copy() + env["DISAGG"] = "true" # Disagg without multinode + + result = run_script(tmp_path, env, sample_benchmark_result) + + assert result.returncode != 0 + assert "Disaggregated mode requires multinode setup" in result.stderr + + def test_missing_result_file(self, tmp_path, single_node_env_vars): + """Test that missing result file causes failure.""" + env = single_node_env_vars.copy() + env["RESULT_FILENAME"] = "nonexistent" + + result = subprocess.run( + [sys.executable, str(SCRIPT_PATH)], + cwd=tmp_path, + env=env, + capture_output=True, + text=True, + ) + + assert result.returncode != 0 + + +# ============================================================================= +# Test latency and throughput calculations +# 
============================================================================= + +class TestCalculations: + """Tests for throughput and latency calculations.""" + + def test_latency_ms_to_seconds_conversion(self, tmp_path, single_node_env_vars): + """Test that _ms fields are converted to seconds.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 8, + "total_token_throughput": 1000.0, + "output_throughput": 800.0, + "custom_metric_ms": 500.0, # Should become custom_metric = 0.5 + } + + result = run_script(tmp_path, single_node_env_vars, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["custom_metric"] == pytest.approx(0.5) + + def test_tpot_to_interactivity_conversion(self, tmp_path, single_node_env_vars): + """Test that tpot fields are converted to interactivity.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 8, + "total_token_throughput": 1000.0, + "output_throughput": 800.0, + "tpot_p50_ms": 20.0, # Should become intvty_p50 = 50 + "tpot_p99_ms": 50.0, # Should become intvty_p99 = 20 + } + + result = run_script(tmp_path, single_node_env_vars, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["intvty_p50"] == pytest.approx(50.0) + assert output_data["intvty_p99"] == pytest.approx(20.0) + + def test_throughput_per_gpu_single_node(self, tmp_path, single_node_env_vars): + """Test throughput per GPU calculation for single node.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 8, + "total_token_throughput": 8000.0, + "output_throughput": 6000.0, + } + + env = single_node_env_vars.copy() + env["TP"] = "4" + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert 
output_data["tput_per_gpu"] == pytest.approx(2000.0) # 8000 / 4 + assert output_data["output_tput_per_gpu"] == pytest.approx(1500.0) # 6000 / 4 + assert output_data["input_tput_per_gpu"] == pytest.approx(500.0) # (8000 - 6000) / 4 + + def test_throughput_per_gpu_multinode(self, tmp_path, multinode_env_vars): + """Test throughput per GPU calculation for multinode.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 64, + "total_token_throughput": 28000.0, # Will be divided by total GPUs + "output_throughput": 16000.0, # Will be divided by decode GPUs + } + + env = multinode_env_vars.copy() + env["PREFILL_GPUS"] = "20" + env["DECODE_GPUS"] = "8" + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["tput_per_gpu"] == pytest.approx(1000.0) # 28000 / 28 + assert output_data["output_tput_per_gpu"] == pytest.approx(2000.0) # 16000 / 8 + assert output_data["input_tput_per_gpu"] == pytest.approx(600.0) # (28000 - 16000) / 20 + + +# ============================================================================= +# Test output file generation +# ============================================================================= + +class TestOutputFile: + """Tests for output file generation.""" + + def test_output_file_created(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """Test that aggregated output file is created.""" + result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_file = tmp_path / "agg_benchmark_result.json" + assert output_file.exists() + + # Verify content matches stdout + with open(output_file) as f: + file_content = json.load(f) + + stdout_content = json.loads(result.stdout) + assert file_content == stdout_content + + def test_output_file_has_correct_prefix(self, tmp_path, 
sample_benchmark_result, single_node_env_vars): + """Test that output file has 'agg_' prefix.""" + result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result, "my_custom_result") + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_file = tmp_path / "agg_my_custom_result.json" + assert output_file.exists() + + +# ============================================================================= +# Test edge cases +# ============================================================================= + +class TestEdgeCases: + """Tests for edge cases and special scenarios.""" + + def test_boolean_disagg_parsing_false(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """Test that DISAGG env var is parsed as boolean correctly for false values.""" + for disagg_value in ["false", "False", "FALSE"]: + env = single_node_env_vars.copy() + env["DISAGG"] = disagg_value + + result = run_script(tmp_path, env, sample_benchmark_result) + assert result.returncode == 0, f"Script failed for DISAGG={disagg_value}: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["disagg"] is False + + def test_boolean_disagg_parsing_true_requires_multinode(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """Test that DISAGG=true without multinode fails.""" + for disagg_value in ["true", "True", "TRUE"]: + env = single_node_env_vars.copy() + env["DISAGG"] = disagg_value + + result = run_script(tmp_path, env, sample_benchmark_result) + assert result.returncode != 0 + + def test_is_multinode_default_false(self, tmp_path, sample_benchmark_result, single_node_env_vars): + """Test that IS_MULTINODE defaults to false when not set.""" + # Don't set IS_MULTINODE + result = run_script(tmp_path, single_node_env_vars, sample_benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["is_multinode"] is False + + def 
test_integer_conversion(self, tmp_path, single_node_env_vars): + """Test that numeric env vars are converted to integers.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 32, + "total_token_throughput": 5000.0, + "output_throughput": 4000.0, + } + + env = single_node_env_vars.copy() + env["ISL"] = "8192" + env["OSL"] = "1024" + + result = run_script(tmp_path, env, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["isl"] == 8192 + assert output_data["osl"] == 1024 + assert isinstance(output_data["isl"], int) + assert isinstance(output_data["osl"], int) + + def test_conc_from_benchmark_result(self, tmp_path, single_node_env_vars): + """Test that conc is read from benchmark result max_concurrency.""" + benchmark_result = { + "model_id": "test-model", + "max_concurrency": 128, + "total_token_throughput": 5000.0, + "output_throughput": 4000.0, + } + + result = run_script(tmp_path, single_node_env_vars, benchmark_result) + assert result.returncode == 0, f"Script failed: {result.stderr}" + + output_data = json.loads(result.stdout) + assert output_data["conc"] == 128 From 9b3a6806352d67cd8902817a18514b5e8faf7eae Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 09:57:30 -0600 Subject: [PATCH 67/98] adding tests --- .github/workflows/label-validation.yml | 3 +- utils/matrix-logic/generate_sweep_configs.py | 40 ++- .../test_generate_sweep_configs.py | 247 +++++++++++++++++- 3 files changed, 276 insertions(+), 14 deletions(-) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 46e8099ee..84fa6e891 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -58,7 +58,6 @@ jobs: '--runner-type', label['runner-type'], '--model-prefix', label['model-prefix'], '--seq-lens', '1k1k', - '--test-mode', '--config-files', 
f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/nvidia-master.yaml", f"{os.environ['GITHUB_WORKSPACE']}/.github/configs/amd-master.yaml", @@ -102,6 +101,8 @@ jobs: ep: ${{ matrix.config.ep }} dp-attn: ${{ matrix.config.dp-attn }} conc: ${{ matrix.config.conc }} + spec-decoding: ${{ matrix.config.spec-decoding }} + disagg: ${{ matrix.config.disagg }} collect-results: needs: validate diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index ca9ce6291..cccdae5ae 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -125,10 +125,17 @@ def generate_full_sweep(args, all_config_data, runner_data): conc = conc_end # Apply max-conc filter if specified + # If max_conc is less than all values, use max_conc directly (if valid) if args.max_conc is not None: - conc_values = [c for c in conc_values if c <= args.max_conc] - if not conc_values: - continue # Skip this bmk if no concurrency values remain + filtered_conc = [c for c in conc_values if c <= args.max_conc] + if not filtered_conc: + # No existing values <= max_conc, so use max_conc directly if valid + if args.max_conc > 0: + conc_values = [args.max_conc] + else: + continue # Skip if max_conc is not positive + else: + conc_values = filtered_conc # For multinode, create a single entry with conc as a list seq_len_str = seq_len_to_str(isl, osl) @@ -161,18 +168,31 @@ def generate_full_sweep(args, all_config_data, runner_data): spec_decoding = bmk.get(Fields.SPEC_DECODING.value, "none") # Apply max-tp filter if specified - if args.max_tp and tp > args.max_tp: - continue + # If tp > max_tp, use max_tp instead of skipping (if valid) + if args.max_tp is not None: + if args.max_tp <= 0: + continue # Skip if max_tp is not positive + if tp > args.max_tp: + tp = args.max_tp # Apply max-ep filter if specified - if args.max_ep and ep is not None and ep > args.max_ep: - continue + # If ep > max_ep, use max_ep instead of 
skipping (if valid) + if args.max_ep is not None: + if args.max_ep <= 0: + continue # Skip if max_ep is not positive + if ep is not None and ep > args.max_ep: + ep = args.max_ep # Apply max-conc filter if specified + # If conc_start > max_conc, use max_conc as both start and end (if valid) if args.max_conc is not None: - conc_end = min(conc_end, args.max_conc) - if conc_start > conc_end: - continue # Skip this bmk if conc_start exceeds max_conc + if args.max_conc <= 0: + continue # Skip if max_conc is not positive + if conc_start > args.max_conc: + conc_start = args.max_conc + conc_end = args.max_conc + else: + conc_end = min(conc_end, args.max_conc) conc = conc_start while conc <= conc_end: diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index 292ad3189..10eaf2568 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -324,16 +324,67 @@ def test_max_conc_filter(self, sample_single_node_config, sample_runner_config, assert len(result) == 3 assert all(entry["conc"] <= 16 for entry in result) + def test_max_conc_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc below config's min should create config with max_conc value.""" + # Config has conc-start=4, so max_conc=1 should create entry with conc=1 + full_sweep_args_single_node.max_conc = 1 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + # Should create 1 entry with conc=1 + assert len(result) == 1 + assert result[0]["conc"] == 1 + + def test_max_conc_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_conc of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_conc = 
invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + def test_max_tp_filter(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): - """max_tp filter should limit TP values.""" + """max_tp filter should use max_tp when config tp exceeds it.""" full_sweep_args_single_node.max_tp = 4 + full_sweep_args_single_node.seq_lens = ["1k1k"] result = generate_full_sweep( full_sweep_args_single_node, sample_single_node_config, sample_runner_config ) - # tp=8 is filtered out, so no results - assert len(result) == 0 + # tp=8 in config, but max_tp=4, so should use tp=4 + assert len(result) > 0 + assert all(entry["tp"] == 4 for entry in result) + + def test_max_tp_creates_config_when_below_min(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp below config's tp should create config with max_tp value.""" + # Config has tp=8, so max_tp=2 should create entries with tp=2 + full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.seq_lens = ["1k1k"] + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) > 0 + assert all(entry["tp"] == 2 for entry in result) + + def test_max_tp_zero_or_negative_skips(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): + """max_tp of 0 or negative should skip configs.""" + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_tp = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + sample_single_node_config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_tp={invalid_value}" def test_step_size(self, sample_single_node_config, sample_runner_config, full_sweep_args_single_node): """Different step sizes should affect 
concurrency progression.""" @@ -789,3 +840,193 @@ def test_multinode_conc_range_expansion(self, sample_runner_config, full_sweep_a assert len(result) == 1 # step_size=2: 1, 2, 4, 8 assert result[0]["conc"] == [1, 2, 4, 8] + + def test_max_ep_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_single_node): + """max_ep below config's ep should create config with max_ep value.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + full_sweep_args_single_node.max_ep = 2 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # ep=8 in config, but max_ep=2, so should use ep=2 + assert len(result) == 1 + assert result[0]["ep"] == 2 + + def test_max_ep_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_single_node): + """max_ep of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 4, "conc-end": 4} + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_single_node.max_ep = invalid_value + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_ep={invalid_value}" + + def test_multinode_max_conc_zero_or_negative_skips(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc of 0 or negative should skip configs.""" + config = { + "test-config": { + "image": 
"test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + for invalid_value in [0, -1, -100]: + full_sweep_args_multi_node.max_conc = invalid_value + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + assert len(result) == 0, f"Expected 0 results for max_conc={invalid_value}" + + def test_multinode_max_conc_creates_config_when_below_min(self, sample_runner_config, full_sweep_args_multi_node): + """Multinode max_conc below all values should create config with max_conc.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + "framework": "dynamo-trt", + "runner": "gb200", + "multinode": True, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + { + "conc-list": [100, 200, 400], + "prefill": { + "num-worker": 1, + "tp": 4, + "ep": 4, + "dp-attn": False, + }, + "decode": { + "num-worker": 1, + "tp": 8, + "ep": 8, + "dp-attn": False, + }, + } + ] + } + ] + } + } + full_sweep_args_multi_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_multi_node, + config, + sample_runner_config + ) + # All conc values (100, 200, 400) > max_conc (1), so should use [1] + assert len(result) == 1 + assert result[0]["conc"] == [1] + + def test_combined_max_filters(self, sample_runner_config, full_sweep_args_single_node): + """Multiple max filters should all apply and create configs with max values.""" + config = { + "test-config": { + "image": "test-image", + "model": "test-model", + "model-prefix": "test", + "precision": "fp4", + 
"framework": "sglang", + "runner": "b200", + "multinode": False, + "seq-len-configs": [ + { + "isl": 1024, + "osl": 1024, + "search-space": [ + {"tp": 8, "ep": 8, "conc-start": 100, "conc-end": 200} + ] + } + ] + } + } + full_sweep_args_single_node.max_tp = 2 + full_sweep_args_single_node.max_ep = 1 + full_sweep_args_single_node.max_conc = 1 + result = generate_full_sweep( + full_sweep_args_single_node, + config, + sample_runner_config + ) + # All values exceed max, so should use max values + assert len(result) == 1 + assert result[0]["tp"] == 2 + assert result[0]["ep"] == 1 + assert result[0]["conc"] == 1 From 8aab1036225c5fa98d89d267e1399ff501ef487c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 10:02:33 -0600 Subject: [PATCH 68/98] adding tests --- .github/configs/nvidia-master.yaml | 1478 ++++++++++++++-------------- 1 file changed, 739 insertions(+), 739 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e0578eba5..519d335c2 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -23,308 +23,308 @@ dsr1-fp4-b200-sglang: - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } -# dsr1-fp4-b200-trt: -# image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 -# model: nvidia/DeepSeek-R1-0528-FP4-V2 -# model-prefix: dsr1 -# runner: b200-trt -# precision: fp4 -# framework: trt -# multinode: false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# # If TP=4, -# # If CONC > 32, then EP=4 -# # If CONC >= 256, DP_ATTN=true -# - { tp: 4, conc-start: 4, conc-end: 32 } -# - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } -# - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } -# # If TP=8, -# # If CONC > 8, then EP=8 -# # If CONC >= 256, DP_ATTN=true -# - { tp: 8, conc-start: 4, conc-end: 8 } -# - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } -# - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, 
conc-end: 256 } -# - isl: 1024 -# osl: 8192 -# search-space: -# # If TP=4, -# # If CONC > 32, then EP=4 -# # If CONC >= 256, DP_ATTN=true -# - { tp: 4, conc-start: 4, conc-end: 32 } -# - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } -# - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } -# # If TP=8, -# # If CONC > 16, then EP=8 -# # If CONC >= 256, DP_ATTN=true -# - { tp: 8, conc-start: 4, conc-end: 16 } -# - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } -# - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } -# - isl: 8192 -# osl: 1024 -# search-space: -# # If TP=4, -# # If CONC > 32, then EP=4 and DP_ATTN=true -# - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } -# - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } -# # If TP=8, -# # If CONC > 32, then EP=8 and DP_ATTN=true -# - { tp: 8, conc-start: 4, conc-end: 32 } -# - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } +dsr1-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: nvidia/DeepSeek-R1-0528-FP4-V2 + model-prefix: dsr1 + runner: b200-trt + precision: fp4 + framework: trt + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 8, then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 8 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 1024 + osl: 8192 + search-space: + # If TP=4, + # If CONC > 32, then EP=4 + # If CONC >= 256, DP_ATTN=true + - { tp: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, conc-start: 64, conc-end: 128 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 256, conc-end: 256 } + # If TP=8, + # If CONC > 16, 
then EP=8 + # If CONC >= 256, DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 16 } + - { tp: 8, ep: 8, conc-start: 32, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 256 } + - isl: 8192 + osl: 1024 + search-space: + # If TP=4, + # If CONC > 32, then EP=4 and DP_ATTN=true + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 4, dp-attn: true, conc-start: 64, conc-end: 256 } + # If TP=8, + # If CONC > 32, then EP=8 and DP_ATTN=true + - { tp: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 256 } -# dsr1-fp8-b200-sglang: -# image: lmsysorg/sglang:v0.5.5-cu129-amd64 -# model: deepseek-ai/DeepSeek-R1-0528 -# model-prefix: dsr1 -# runner: b200 -# precision: fp8 -# framework: sglang -# multinode: false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } +dsr1-fp8-b200-sglang: + image: lmsysorg/sglang:v0.5.5-cu129-amd64 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: b200 + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } -# dsr1-fp8-b200-trt: -# image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 -# model: deepseek-ai/DeepSeek-R1-0528 -# model-prefix: dsr1 -# runner: b200-trt -# precision: fp8 -# framework: trt -# multinode: false -# seq-len-configs: -# # For all sequence lengths, EP=TP -# - isl: 1024 -# osl: 1024 -# search-space: -# # If CONC > 32, then DP_ATTN=true -# - { tp: 8, ep: 8, conc-start: 4, 
conc-end: 32 } -# - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } -# - isl: 1024 -# osl: 8192 -# search-space: -# # If CONC > 64, then DP_ATTN=true -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -# - isl: 8192 -# osl: 1024 -# search-space: -# # If CONC > 64, then DP_ATTN=true -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } +dsr1-fp8-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: b200-trt + precision: fp8 + framework: trt + multinode: false + seq-len-configs: + # For all sequence lengths, EP=TP + - isl: 1024 + osl: 1024 + search-space: + # If CONC > 32, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + # If CONC > 64, then DP_ATTN=true + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -# dsr1-fp8-h200-sglang: -# image: lmsysorg/sglang:v0.5.5-cu129-amd64 -# model: deepseek-ai/DeepSeek-R1-0528 -# model-prefix: dsr1 -# runner: h200 -# precision: fp8 -# framework: sglang -# multinode: false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 8, conc-start: 4, conc-end: 64 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 8, conc-start: 4, conc-end: 64 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 8, conc-start: 4, conc-end: 64 } +dsr1-fp8-h200-sglang: + image: lmsysorg/sglang:v0.5.5-cu129-amd64 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: h200 + precision: fp8 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 8, 
conc-start: 4, conc-end: 64 } -# dsr1-fp8-h200-trt: -# image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 -# model: deepseek-ai/DeepSeek-R1-0528 -# model-prefix: dsr1 -# runner: h200 -# precision: fp8 -# framework: trt -# multinode: false -# # For all sequence lengths, EP=TP -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# # If CONC > 64, then DP_ATTN=true -# search-space: -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -# - isl: 1024 -# osl: 8192 -# # If CONC > 64, then DP_ATTN=true -# search-space: -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } -# - isl: 8192 -# osl: 1024 -# # If CONC > 32, then DP_ATTN=true -# search-space: -# - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } -# - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } +dsr1-fp8-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.1.0rc2.post2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: h200 + precision: fp8 + framework: trt + multinode: false + # For all sequence lengths, EP=TP + seq-len-configs: + - isl: 1024 + osl: 1024 + # If CONC > 64, then DP_ATTN=true + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + # If CONC > 64, then DP_ATTN=true + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + # If CONC > 32, then DP_ATTN=true + search-space: + - { tp: 8, ep: 8, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 64 } -# gptoss-fp4-b200-trt: -# image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 -# model: openai/gpt-oss-120b -# model-prefix: gptoss -# runner: b200-trt -# precision: fp4 -# framework: trt -# multinode: false -# # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 1, conc-start: 64, conc-end: 128 } -# - { tp: 2, conc-start: 4, conc-end: 128 } -# - { tp: 4, conc-start: 4, conc-end: 128 } -# - { tp: 8, conc-start: 
4, conc-end: 8 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 1, conc-start: 64, conc-end: 128 } -# - { tp: 2, conc-start: 4, conc-end: 128 } -# - { tp: 4, conc-start: 4, conc-end: 128 } -# - { tp: 8, conc-start: 4, conc-end: 16 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 1, conc-start: 64, conc-end: 128 } -# - { tp: 2, conc-start: 4, conc-end: 128 } -# - { tp: 4, conc-start: 4, conc-end: 128 } -# - { tp: 8, conc-start: 4, conc-end: 8 } +gptoss-fp4-b200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:1.2.0rc0.post1 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: b200-trt + precision: fp4 + framework: trt + multinode: false + # For all sequence lengths, if CONC >= 256, then EP=TP and DP_ATTN=true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 64, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 1, conc-start: 64, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 16 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 64, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, conc-start: 4, conc-end: 8 } -# gptoss-fp4-b200-vllm: -# image: vllm/vllm-openai:v0.11.0 -# model: openai/gpt-oss-120b -# model-prefix: gptoss -# runner: b200 -# precision: fp4 -# framework: vllm -# multinode: false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 1, conc-start: 4, conc-end: 128 } -# - { tp: 2, conc-start: 4, conc-end: 128 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 8 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 1, conc-start: 4, conc-end: 128 } -# - { tp: 2, conc-start: 4, conc-end: 128 } -# - { tp: 4, 
conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 8 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 1, conc-start: 4, conc-end: 128 } -# - { tp: 2, conc-start: 4, conc-end: 128 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 4 } +gptoss-fp4-b200-vllm: + image: vllm/vllm-openai:v0.11.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: b200 + precision: fp4 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 128 } + - { tp: 2, conc-start: 4, conc-end: 128 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 4 } -# gptoss-fp4-h100-vllm: -# image: vllm/vllm-openai:v0.11.0 -# model: openai/gpt-oss-120b -# model-prefix: gptoss -# runner: h100 -# precision: fp4 -# framework: vllm -# multinode: false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 2, conc-start: 4, conc-end: 64 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 64 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 2, conc-start: 4, conc-end: 64 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 64 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 2, conc-start: 4, conc-end: 64 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 16 } +gptoss-fp4-h100-vllm: + image: vllm/vllm-openai:v0.11.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: h100 
+ precision: fp4 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 16 } -# gptoss-fp4-h200-trt: -# image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev -# model: openai/gpt-oss-120b -# model-prefix: gptoss -# runner: h200 -# precision: fp4 -# framework: trt -# multinode: false -# # For all sequence lengths, EP=TP, DP_ATTENTION=false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } -# - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } -# - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } +gptoss-fp4-h200-trt: + image: nvcr.io#nvidia/tensorrt-llm/release:gpt-oss-dev + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: h200 + precision: fp4 + framework: trt + multinode: 
false + # For all sequence lengths, EP=TP, DP_ATTENTION=false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, ep: 1, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 2, ep: 2, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, dp-attn: false, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 8, dp-attn: false, conc-start: 4, conc-end: 8 } -# gptoss-fp4-h200-vllm: -# image: vllm/vllm-openai:v0.11.0 -# model: openai/gpt-oss-120b -# model-prefix: gptoss -# runner: h200 -# precision: fp4 -# framework: vllm -# multinode: false -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - { tp: 1, conc-start: 4, conc-end: 4 } -# - { tp: 2, conc-start: 4, conc-end: 64 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 64 } -# - isl: 1024 -# osl: 8192 -# search-space: -# - { tp: 1, conc-start: 4, conc-end: 4 } -# - { tp: 2, conc-start: 4, conc-end: 64 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 64 } -# - isl: 8192 -# osl: 1024 -# search-space: -# - { tp: 1, conc-start: 4, conc-end: 64 } -# - { tp: 2, conc-start: 4, conc-end: 64 } -# - { tp: 4, conc-start: 4, conc-end: 64 } -# - { tp: 8, conc-start: 4, conc-end: 32 } +gptoss-fp4-h200-vllm: + image: vllm/vllm-openai:v0.11.0 + model: openai/gpt-oss-120b + model-prefix: gptoss + runner: h200 + 
precision: fp4 + framework: vllm + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 4 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 1024 + osl: 8192 + search-space: + - { tp: 1, conc-start: 4, conc-end: 4 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 64 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 1, conc-start: 4, conc-end: 64 } + - { tp: 2, conc-start: 4, conc-end: 64 } + - { tp: 4, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 32 } dsr1-fp4-gb200-dynamo-trt: image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 @@ -339,414 +339,414 @@ dsr1-fp4-gb200-dynamo-trt: - isl: 1024 osl: 1024 search-space: - # # MTP configurations - # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # # NOTE: Prefill tp and ep are always 4 because each GB200 node has 4 GPUs and - # # ctx_tp_size is hardcoded to 4 in launch_gb200-nv.sh. Decode tp/ep matches gen_tp_size. - # # For 1k/1k: prefill batch-size=4, max-num-tokens=4608 - # - spec-decoding: "mtp" - # conc-list: [ 1, 2, 4, 8, 16, 36 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: false - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=128" - # - "DECODE_MAX_BATCH_SIZE=32" - # - "DECODE_GPU_MEM_FRACTION=0.9" - # - "DECODE_MTP_SIZE=3" + # MTP configurations + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # NOTE: Prefill tp and ep are always 4 because each GB200 node has 4 GPUs and + # ctx_tp_size is hardcoded to 4 in launch_gb200-nv.sh. Decode tp/ep matches gen_tp_size. 
+ # For 1k/1k: prefill batch-size=4, max-num-tokens=4608 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 16, 36 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=3" - # # dep - Run Data-Expert Parallel mode (attention_dp=true) - # - spec-decoding: "mtp" - # conc-list: [ 512, 1075 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=256" - # - "DECODE_MAX_BATCH_SIZE=64" - # - "DECODE_GPU_MEM_FRACTION=0.7" - # - "DECODE_MTP_SIZE=3" + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 512, 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" - # - spec-decoding: "mtp" - # conc-list: [ 2150 ] - # prefill: - # num-worker: 2 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=256" - # - "DECODE_MAX_BATCH_SIZE=128" - # - "DECODE_GPU_MEM_FRACTION=0.7" - # - "DECODE_MTP_SIZE=1" + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + 
additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=1" - # - spec-decoding: "mtp" - # conc-list: [ 512 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 32 - # ep: 32 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=64" - # - "DECODE_MAX_BATCH_SIZE=16" - # - "DECODE_GPU_MEM_FRACTION=0.6" - # - "DECODE_MTP_SIZE=3" + - spec-decoding: "mtp" + conc-list: [ 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.6" + - "DECODE_MTP_SIZE=3" - # - spec-decoding: "mtp" - # conc-list: [ 2252 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=512" - # - "DECODE_MAX_BATCH_SIZE=256" - # - "DECODE_GPU_MEM_FRACTION=0.8" - # - "DECODE_MTP_SIZE=1" + - spec-decoding: "mtp" + conc-list: [ 2252 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=1" - # # Non-MTP configurations (default 
spec_decoding="none") - # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: false - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 4 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=128" - # - "DECODE_MAX_BATCH_SIZE=128" - # - "DECODE_GPU_MEM_FRACTION=0.9" - # - "DECODE_MTP_SIZE=0" + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" - # # dep - Run Data-Expert Parallel mode (attention_dp=true) - # - conc-list: [ 1075 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 32 - # ep: 32 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=32" - # - "DECODE_MAX_BATCH_SIZE=32" - # - "DECODE_GPU_MEM_FRACTION=0.7" - # - "DECODE_MTP_SIZE=0" + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" - # - conc-list: [ 1075 ] - # prefill: - # 
num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=64" - # - "DECODE_MAX_BATCH_SIZE=64" - # - "DECODE_GPU_MEM_FRACTION=0.75" - # - "DECODE_MTP_SIZE=0" + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" - # - conc-list: [ 2048, 4300 ] - # prefill: - # num-worker: 2 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=256" - # - "DECODE_MAX_BATCH_SIZE=256" - # - "DECODE_GPU_MEM_FRACTION=0.75" - # - "DECODE_MTP_SIZE=0" + - conc-list: [ 2048, 4300 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" - # - conc-list: [ 4300 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=4608" - # - "PREFILL_MAX_BATCH_SIZE=4" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=512" - # - "DECODE_MAX_BATCH_SIZE=512" - # - "DECODE_GPU_MEM_FRACTION=0.8" - # - "DECODE_MTP_SIZE=0" + - conc-list: [ 4300 ] + prefill: + 
num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=512" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=0" - # - isl: 8192 - # osl: 1024 - # search-space: - # # MTP configurations (spec_decoding="mtp") - # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 - # - spec-decoding: "mtp" - # conc-list: [ 1, 2, 4, 8, 18 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: false - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=64" - # - "DECODE_MAX_BATCH_SIZE=16" - # - "DECODE_GPU_MEM_FRACTION=0.9" - # - "DECODE_MTP_SIZE=3" + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations (spec_decoding="mtp") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 18 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=3" - # # dep - Run Data-Expert Parallel mode (attention_dp=true) - # - spec-decoding: "mtp" - # conc-list: [ 128, 269 ] - # prefill: - # num-worker: 5 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 1 - # tp: 32 - # ep: 32 - # dp-attn: true - # additional-settings: - # - 
"DECODE_MAX_NUM_TOKENS=32" - # - "DECODE_MAX_BATCH_SIZE=8" - # - "DECODE_GPU_MEM_FRACTION=0.7" - # - "DECODE_MTP_SIZE=3" + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 128, 269 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=8" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" - # - spec-decoding: "mtp" - # conc-list: [ 538 ] - # prefill: - # num-worker: 8 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 1 - # tp: 32 - # ep: 32 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=64" - # - "DECODE_MAX_BATCH_SIZE=16" - # - "DECODE_GPU_MEM_FRACTION=0.7" - # - "DECODE_MTP_SIZE=3" + - spec-decoding: "mtp" + conc-list: [ 538 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" - # - spec-decoding: "mtp" - # conc-list: [ 1075 ] - # prefill: - # num-worker: 8 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=256" - # - "DECODE_MAX_BATCH_SIZE=64" - # - "DECODE_GPU_MEM_FRACTION=0.75" - # - "DECODE_MTP_SIZE=2" + - spec-decoding: "mtp" + conc-list: [ 1075 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - 
"PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=2" - # - spec-decoding: "mtp" - # conc-list: [ 2150 ] - # prefill: - # num-worker: 6 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 1 - # tp: 8 - # ep: 8 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=512" - # - "DECODE_MAX_BATCH_SIZE=256" - # - "DECODE_GPU_MEM_FRACTION=0.8" - # - "DECODE_MTP_SIZE=1" + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=1" - # # Non-MTP configurations (default spec_decoding="none") - # # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # - conc-list: [ 1, 2, 4, 8, 16, 34 ] - # prefill: - # num-worker: 1 - # tp: 4 - # ep: 4 - # dp-attn: false - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 3 - # tp: 8 - # ep: 8 - # dp-attn: false - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=32" - # - "DECODE_MAX_BATCH_SIZE=32" - # - "DECODE_GPU_MEM_FRACTION=0.9" - # - "DECODE_MTP_SIZE=0" + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 34 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 
+ dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" - # # dep - Run Data-Expert Parallel mode (attention_dp=true) - # - conc-list: [ 256, 538 ] - # prefill: - # num-worker: 4 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 1 - # tp: 32 - # ep: 32 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=16" - # - "DECODE_MAX_BATCH_SIZE=16" - # - "DECODE_GPU_MEM_FRACTION=0.7" - # - "DECODE_MTP_SIZE=0" + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - conc-list: [ 256, 538 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=16" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" - # - conc-list: [ 1075 ] - # prefill: - # num-worker: 6 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - "PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=64" - # - "DECODE_MAX_BATCH_SIZE=64" - # - "DECODE_GPU_MEM_FRACTION=0.75" - # - "DECODE_MTP_SIZE=0" + - conc-list: [ 1075 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" - # - conc-list: [ 2150 ] - # prefill: - # num-worker: 8 - # tp: 4 - # ep: 4 - # dp-attn: true - # additional-settings: - # - 
"PREFILL_MAX_NUM_TOKENS=8448" - # - "PREFILL_MAX_BATCH_SIZE=1" - # decode: - # num-worker: 1 - # tp: 16 - # ep: 16 - # dp-attn: true - # additional-settings: - # - "DECODE_MAX_NUM_TOKENS=128" - # - "DECODE_MAX_BATCH_SIZE=128" - # - "DECODE_GPU_MEM_FRACTION=0.75" - # - "DECODE_MTP_SIZE=0" + - conc-list: [ 2150 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" - conc-list: [ 2150 ] prefill: @@ -768,59 +768,59 @@ dsr1-fp4-gb200-dynamo-trt: - "DECODE_GPU_MEM_FRACTION=0.8" - "DECODE_MTP_SIZE=0" -# dsr1-fp8-gb200-dynamo-sglang: -# image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 -# model: deepseek-ai/DeepSeek-R1-0528 -# model-prefix: dsr1 -# runner: gb200 -# precision: fp8 -# framework: dynamo-sglang -# multinode: true -# disagg: true -# seq-len-configs: -# - isl: 1024 -# osl: 1024 -# search-space: -# - spec-decoding: "none" -# # conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] -# conc-list: [ 1024, 2048 ] -# prefill: -# num-worker: 3 -# # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: -# # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh -# tp: 1 -# ep: 1 -# dp-attn: true -# additional-settings: -# - "PREFILL_NODES=6" -# - "N_ADDITIONAL_FRONTENDS=8" -# decode: -# num-worker: 1 -# tp: 1 -# ep: 1 -# dp-attn: true -# additional-settings: -# - "DECODE_NODES=12" -# - isl: 8192 -# osl: 1024 -# search-space: -# - spec-decoding: "none" -# # conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] -# conc-list: [ 128 ] -# prefill: -# num-worker: 6 -# # tp, ep, and dp-attn do nothing because they are hardcoded in the following 
file: -# # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh -# tp: 1 -# ep: 1 -# dp-attn: true -# additional-settings: -# - "PREFILL_NODES=12" -# - "N_ADDITIONAL_FRONTENDS=8" -# decode: -# num-worker: 1 -# tp: 1 -# ep: 1 -# dp-attn: true -# additional-settings: -# - "DECODE_NODES=6" +dsr1-fp8-gb200-dynamo-sglang: + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: gb200 + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - spec-decoding: "none" + # conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] + conc-list: [ 1024, 2048 ] + prefill: + num-worker: 3 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=6" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + - isl: 8192 + osl: 1024 + search-space: + - spec-decoding: "none" + # conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + conc-list: [ 128 ] + prefill: + num-worker: 6 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=12" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=6" From af3b8bee9b87fa662baac8973c91dab1aaf4920a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 10:19:38 -0600 Subject: [PATCH 
69/98] adding tests --- .github/configs/nvidia-master.yaml | 402 +++++++++---------- utils/matrix-logic/generate_sweep_configs.py | 121 ++++-- 2 files changed, 284 insertions(+), 239 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 519d335c2..052c040dd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -365,197 +365,197 @@ dsr1-fp4-gb200-dynamo-trt: - "DECODE_GPU_MEM_FRACTION=0.9" - "DECODE_MTP_SIZE=3" - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - spec-decoding: "mtp" - conc-list: [ 512, 1075 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 512, 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 2150 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=1" + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - 
"PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=1" - - spec-decoding: "mtp" - conc-list: [ 512 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.6" - - "DECODE_MTP_SIZE=3" + - spec-decoding: "mtp" + conc-list: [ 512 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.6" + - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 2252 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=1" + - spec-decoding: "mtp" + conc-list: [ 2252 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=1" - # Non-MTP configurations (default spec_decoding="none") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] - prefill: - 
num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 4 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=0" + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - conc-list: [ 1075 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=0" + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" - - conc-list: [ 1075 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - 
"DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" + - conc-list: [ 1075 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" - - conc-list: [ 2048, 4300 ] - prefill: - num-worker: 2 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" + - conc-list: [ 2048, 4300 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" - - conc-list: [ 4300 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=512" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=0" + - conc-list: [ 4300 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=4608" + - "PREFILL_MAX_BATCH_SIZE=4" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=512" + - "DECODE_GPU_MEM_FRACTION=0.8" + - 
"DECODE_MTP_SIZE=0" - - isl: 8192 - osl: 1024 - search-space: + - isl: 8192 + osl: 1024 + search-space: # MTP configurations (spec_decoding="mtp") # tep - Run Tensor-Expert Parallel mode (attention_dp=false) # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 @@ -748,25 +748,25 @@ dsr1-fp4-gb200-dynamo-trt: - "DECODE_GPU_MEM_FRACTION=0.75" - "DECODE_MTP_SIZE=0" - - conc-list: [ 2150 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=0" + - conc-list: [ 2150 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=0" dsr1-fp8-gb200-dynamo-sglang: image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index cccdae5ae..7fbcde2e7 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -236,6 +236,7 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): """Generate runner-model sweep configurations. Assumes all_config_data has been validated by validate_config_structure(). + Supports both single-node and multinode configurations. 
""" runner_nodes = runner_data.get(args.runner_type) @@ -257,6 +258,8 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): if val[Fields.RUNNER.value] != args.runner_type: continue + is_multinode = val.get(Fields.MULTINODE.value, False) + # Get model code for exp_name model_code = val[Fields.MODEL_PREFIX.value] # Get disagg value, defaulting to False if not specified @@ -269,44 +272,86 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): target_config = config break - highest_tp_bmk = max( - target_config[Fields.SEARCH_SPACE.value], key=lambda x: x[Fields.TP.value]) - # Since we are just testing, pick the highest TP for this config and just test - # on that TP with the lowest concurrency available - highest_tp = highest_tp_bmk[Fields.TP.value] - lowest_conc = highest_tp_bmk[Fields.CONC_START.value] - - ep = highest_tp_bmk.get(Fields.EP.value) - dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) - - for node in runner_nodes: - entry = { - Fields.IMAGE.value: val[Fields.IMAGE.value], - Fields.MODEL.value: val[Fields.MODEL.value], - Fields.PRECISION.value: val[Fields.PRECISION.value], - Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], - # Add one entry for each node under specified runner type - Fields.RUNNER.value: node, - # Again, just use 1k1k since this is just meant to smoke test all runners - Fields.ISL.value: 1024, - Fields.OSL.value: 1024, - Fields.TP.value: highest_tp, - Fields.EP.value: 1, # Default, - Fields.DP_ATTN.value: False, # Default - Fields.SPEC_DECODING.value: "none", # Default - Fields.CONC.value: lowest_conc, - Fields.MAX_MODEL_LEN.value: 2048, - Fields.EXP_NAME.value: f"{model_code}_test", - Fields.DISAGG.value: disagg, - } - - # Add optional fields if they exist - if ep is not None: - entry[Fields.EP.value] = ep - if dp_attn is not None: - entry[Fields.DP_ATTN.value] = dp_attn - - matrix_values.append(entry) + if target_config is None: + continue + + if is_multinode: + # For multinode, 
find the search space entry with the lowest concurrency + def get_lowest_conc(search_space_entry): + conc_list = search_space_entry.get(Fields.CONC_LIST.value, []) + return min(conc_list) if conc_list else float('inf') + + lowest_conc_entry = min( + target_config[Fields.SEARCH_SPACE.value], key=get_lowest_conc) + + conc_list = lowest_conc_entry.get(Fields.CONC_LIST.value, []) + lowest_conc = min(conc_list) if conc_list else 1 + + spec_decoding = lowest_conc_entry.get( + Fields.SPEC_DECODING.value, "none") + prefill_config = lowest_conc_entry[Fields.PREFILL.value] + decode_config = lowest_conc_entry[Fields.DECODE.value] + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.PREFILL.value: { + Fields.NUM_WORKER.value: prefill_config[Fields.NUM_WORKER.value], + Fields.TP.value: prefill_config[Fields.TP.value], + Fields.EP.value: prefill_config[Fields.EP.value], + Fields.DP_ATTN.value: prefill_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: prefill_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.DECODE.value: { + Fields.NUM_WORKER.value: decode_config[Fields.NUM_WORKER.value], + Fields.TP.value: decode_config[Fields.TP.value], + Fields.EP.value: decode_config[Fields.EP.value], + Fields.DP_ATTN.value: decode_config[Fields.DP_ATTN.value], + Fields.ADDITIONAL_SETTINGS.value: decode_config.get(Fields.ADDITIONAL_SETTINGS.value, []), + }, + Fields.CONC.value: [lowest_conc], + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=True)) + else: + # Single-node: pick highest TP config with lowest 
concurrency + highest_tp_bmk = max( + target_config[Fields.SEARCH_SPACE.value], key=lambda x: x[Fields.TP.value]) + highest_tp = highest_tp_bmk[Fields.TP.value] + lowest_conc = highest_tp_bmk[Fields.CONC_START.value] + + ep = highest_tp_bmk.get(Fields.EP.value) + dp_attn = highest_tp_bmk.get(Fields.DP_ATTN.value) + spec_decoding = highest_tp_bmk.get(Fields.SPEC_DECODING.value, "none") + + for node in runner_nodes: + entry = { + Fields.IMAGE.value: val[Fields.IMAGE.value], + Fields.MODEL.value: val[Fields.MODEL.value], + Fields.PRECISION.value: val[Fields.PRECISION.value], + Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], + Fields.RUNNER.value: node, + Fields.ISL.value: 1024, + Fields.OSL.value: 1024, + Fields.TP.value: highest_tp, + Fields.EP.value: ep if ep is not None else 1, + Fields.DP_ATTN.value: dp_attn if dp_attn is not None else False, + Fields.SPEC_DECODING.value: spec_decoding, + Fields.CONC.value: lowest_conc, + Fields.MAX_MODEL_LEN.value: 2048, + Fields.EXP_NAME.value: f"{model_code}_test", + Fields.DISAGG.value: disagg, + } + matrix_values.append(validate_matrix_entry(entry, is_multinode=False)) return matrix_values From 84feeda7f86b47235925ccba496d44ff182a66c1 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 10:31:37 -0600 Subject: [PATCH 70/98] adding tests --- .github/workflows/e2e-tests.yml | 6 +++++- utils/matrix-logic/generate_sweep_configs.py | 18 ++++++++++++++++++ .../test_generate_sweep_configs.py | 4 +++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 4158c7a38..770470b03 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -1,5 +1,5 @@ name: End-to-End Tests -run-name: e2e Test - ${{ github.event.inputs.generate-cli-command }} +run-name: e2e Test - ${{ inputs.test-name || github.event.inputs.generate-cli-command }} on: workflow_dispatch: @@ -8,6 +8,10 @@ on: description: "Command passed to generate 
matrix script" required: true type: string + test-name: + description: "Name for this test run" + required: false + type: string jobs: get-jobs: diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 7fbcde2e7..666e21af8 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -260,6 +260,12 @@ def generate_runner_model_sweep_config(args, all_config_data, runner_data): is_multinode = val.get(Fields.MULTINODE.value, False) + # Skip configs that don't match the requested node type + if args.single_node and is_multinode: + continue + if args.multi_node and not is_multinode: + continue + # Get model code for exp_name model_code = val[Fields.MODEL_PREFIX.value] # Get disagg value, defaulting to False if not specified @@ -517,6 +523,18 @@ def main(): required=False, help='Filter runner nodes by substring match (e.g., "mi300x-amd" to only include nodes containing that string)' ) + test_node_group = test_config_parser.add_mutually_exclusive_group( + required=True) + test_node_group.add_argument( + '--single-node', + action='store_true', + help='Generate single-node configurations only' + ) + test_node_group.add_argument( + '--multi-node', + action='store_true', + help='Generate multi-node configurations only' + ) test_config_parser.add_argument( '-h', '--help', action='help', diff --git a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix-logic/test_generate_sweep_configs.py index 10eaf2568..1381f394e 100644 --- a/utils/matrix-logic/test_generate_sweep_configs.py +++ b/utils/matrix-logic/test_generate_sweep_configs.py @@ -484,11 +484,13 @@ class TestGenerateRunnerModelSweepConfig: @pytest.fixture def runner_sweep_args(self): - """Args for runner-model-sweep command.""" + """Args for runner-model-sweep command (single-node).""" args = argparse.Namespace() args.runner_type = "mi300x" args.runner_config = "runners.yaml" args.runner_node_filter = None + 
args.single_node = True + args.multi_node = False return args def test_basic_runner_sweep(self, sample_single_node_config, sample_runner_config, runner_sweep_args): From e8392775c59d303dadd763cb5be96347732a6b5b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 10:53:46 -0600 Subject: [PATCH 71/98] adding tests --- .github/workflows/benchmark-multinode-tmpl.yml | 4 ++++ .github/workflows/benchmark-tmpl.yml | 4 ++++ .github/workflows/e2e-tests.yml | 2 ++ .github/workflows/full-sweep-1k1k-scheduler.yml | 4 ++++ .github/workflows/full-sweep-1k8k-scheduler.yml | 4 ++++ .github/workflows/full-sweep-8k1k-scheduler.yml | 4 ++++ utils/matrix-logic/generate_sweep_configs.py | 4 ++++ utils/matrix-logic/test_validation.py | 2 ++ utils/matrix-logic/validation.py | 2 ++ utils/process_result.py | 4 +++- 10 files changed, 33 insertions(+), 1 deletion(-) diff --git a/.github/workflows/benchmark-multinode-tmpl.yml b/.github/workflows/benchmark-multinode-tmpl.yml index d5561739f..dae671e29 100644 --- a/.github/workflows/benchmark-multinode-tmpl.yml +++ b/.github/workflows/benchmark-multinode-tmpl.yml @@ -12,6 +12,9 @@ on: model: required: true type: string + model-prefix: + required: true + type: string framework: required: true type: string @@ -82,6 +85,7 @@ on: env: EXP_NAME: ${{ inputs.exp-name }} IMAGE: ${{ inputs.image }} + MODEL_PREFIX: ${{ inputs.model-prefix }} FRAMEWORK: ${{ inputs.framework }} PRECISION: ${{ inputs.precision }} ISL: ${{ inputs.isl }} diff --git a/.github/workflows/benchmark-tmpl.yml b/.github/workflows/benchmark-tmpl.yml index edf8454a0..29dbb35e3 100644 --- a/.github/workflows/benchmark-tmpl.yml +++ b/.github/workflows/benchmark-tmpl.yml @@ -11,6 +11,9 @@ on: model: required: true type: string + model-prefix: + required: true + type: string precision: required: true type: string @@ -57,6 +60,7 @@ env: HF_HUB_CACHE: '/mnt/hf_hub_cache/' EXP_NAME: ${{ inputs.exp-name }} MODEL: ${{ inputs.model }} + MODEL_PREFIX: ${{ inputs.model-prefix }} ISL: 
${{ inputs.isl }} OSL: ${{ inputs.osl }} MAX_MODEL_LEN: ${{ inputs.max-model-len }} diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index 770470b03..b4398b49c 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -51,6 +51,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: ${{ matrix.config.exp-name }} @@ -88,6 +89,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 38132ceab..0aac4a598 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -55,6 +55,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_1k1k" @@ -92,6 +93,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} @@ -118,6 +120,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_1k1k" @@ -155,6 +158,7 @@ 
jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 8f0571eea..911c59c87 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -55,6 +55,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_1k8k" @@ -92,6 +93,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} @@ -118,6 +120,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_1k8k" @@ -155,6 +158,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 3a59c9987..67bfeaa9a 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -55,6 +55,7 @@ jobs: runner: ${{ matrix.config.runner }} image: 
${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_8k1k" @@ -92,6 +93,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} @@ -118,6 +120,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} exp-name: "dsr1_8k1k" @@ -155,6 +158,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix-logic/generate_sweep_configs.py index 666e21af8..8fc47651c 100644 --- a/utils/matrix-logic/generate_sweep_configs.py +++ b/utils/matrix-logic/generate_sweep_configs.py @@ -142,6 +142,7 @@ def generate_full_sweep(args, all_config_data, runner_data): entry = { Fields.IMAGE.value: image, Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, Fields.PRECISION.value: precision, Fields.FRAMEWORK.value: framework, Fields.RUNNER.value: runner, @@ -200,6 +201,7 @@ def generate_full_sweep(args, all_config_data, runner_data): entry = { Fields.IMAGE.value: image, Fields.MODEL.value: model, + Fields.MODEL_PREFIX.value: model_code, Fields.PRECISION.value: precision, Fields.FRAMEWORK.value: framework, Fields.RUNNER.value: runner, @@ -302,6 +304,7 @@ def get_lowest_conc(search_space_entry): entry = { Fields.IMAGE.value: 
val[Fields.IMAGE.value], Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, Fields.PRECISION.value: val[Fields.PRECISION.value], Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], Fields.RUNNER.value: node, @@ -343,6 +346,7 @@ def get_lowest_conc(search_space_entry): entry = { Fields.IMAGE.value: val[Fields.IMAGE.value], Fields.MODEL.value: val[Fields.MODEL.value], + Fields.MODEL_PREFIX.value: model_code, Fields.PRECISION.value: val[Fields.PRECISION.value], Fields.FRAMEWORK.value: val[Fields.FRAMEWORK.value], Fields.RUNNER.value: node, diff --git a/utils/matrix-logic/test_validation.py b/utils/matrix-logic/test_validation.py index 003278feb..008ed2b42 100644 --- a/utils/matrix-logic/test_validation.py +++ b/utils/matrix-logic/test_validation.py @@ -27,6 +27,7 @@ def valid_single_node_matrix_entry(): return { "image": "rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915", "model": "amd/DeepSeek-R1-0528-MXFP4-Preview", + "model-prefix": "dsr1", "precision": "fp4", "framework": "sglang", "spec-decoding": "none", @@ -49,6 +50,7 @@ def valid_multinode_matrix_entry(): return { "image": "nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3", "model": "deepseek-r1-fp4", + "model-prefix": "dsr1", "precision": "fp4", "framework": "dynamo-trt", "spec-decoding": "none", diff --git a/utils/matrix-logic/validation.py b/utils/matrix-logic/validation.py index 496d5003f..0ae173780 100644 --- a/utils/matrix-logic/validation.py +++ b/utils/matrix-logic/validation.py @@ -68,6 +68,7 @@ class SingleNodeMatrixEntry(BaseModel): image: str model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) precision: str framework: str spec_decoding: Literal["mtp", "draft_model", "none"] = Field( @@ -104,6 +105,7 @@ class MultiNodeMatrixEntry(BaseModel): image: str model: str + model_prefix: str = Field(alias=Fields.MODEL_PREFIX.value) precision: str framework: str spec_decoding: Literal["mtp", "draft_model", "none"] = Field( 
diff --git a/utils/process_result.py b/utils/process_result.py index 77f8c2a4b..558443948 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -25,10 +25,11 @@ def get_required_env_vars(required_vars): # Base required env vars base_env = get_required_env_vars([ 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', - 'RESULT_FILENAME', 'ISL', 'OSL', 'DISAGG' + 'RESULT_FILENAME', 'ISL', 'OSL', 'DISAGG', 'MODEL_PREFIX' ]) hw = base_env['RUNNER_TYPE'] +model_prefix = base_env['MODEL_PREFIX'] framework = base_env['FRAMEWORK'] precision = base_env['PRECISION'] spec_decoding = base_env['SPEC_DECODING'] @@ -44,6 +45,7 @@ def get_required_env_vars(required_vars): 'hw': hw, 'conc': int(bmk_result['max_concurrency']), 'model': bmk_result['model_id'], + 'infmax_model_prefix': model_prefix, 'framework': framework, 'precision': precision, 'spec_decoding': spec_decoding, From 37574dd6248b339c5a95dafa37e0b72676fc5299 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 10:54:46 -0600 Subject: [PATCH 72/98] adding tests --- utils/summarize.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/utils/summarize.py b/utils/summarize.py index 30c54e18f..be7808944 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -16,7 +16,7 @@ # Single-node and multi-node results have different field and therefore need to be printed separately if single_node_results: single_node_results.sort(key=lambda r: ( - r['model'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) + r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) print("## Single-Node Results\n") single_node_header = '''\ @@ -27,7 +27,7 @@ for result in single_node_results: print( - f"| {result['model']} " + f"| {result['infmax_model_prefix']} " f"| {result['hw'].upper()} " f"| {result['framework'].upper()} " f"| {result['precision'].upper()} " @@ -61,7 +61,7 @@ for result in 
multinode_results: print( - f"| {result['model']} " + f"| {result['infmax_model_prefix']} " f"| {result['hw'].upper()} " f"| {result['framework'].upper()} " f"| {result['precision'].upper()} " From 929ba0c885a7e3ba5d9041a39fd9e82d29e0a238 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Mon, 1 Dec 2025 16:36:49 -0600 Subject: [PATCH 73/98] add updates for newest gb200 merge --- .github/configs/nvidia-master.yaml | 474 ++++++++++-------- .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 14 +- runners/launch_gb200-nv.sh | 26 +- 3 files changed, 292 insertions(+), 222 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 052c040dd..db24d0490 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -556,217 +556,217 @@ dsr1-fp4-gb200-dynamo-trt: - isl: 8192 osl: 1024 search-space: - # MTP configurations (spec_decoding="mtp") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 - - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8, 18 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=3" - - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - spec-decoding: "mtp" - conc-list: [ 128, 269 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=8" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - - - spec-decoding: "mtp" - conc-list: [ 538 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - 
dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - - - spec-decoding: "mtp" - conc-list: [ 1075 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=2" - - - spec-decoding: "mtp" - conc-list: [ 2150 ] - prefill: - num-worker: 6 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=1" - - # Non-MTP configurations (default spec_decoding="none") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - - conc-list: [ 1, 2, 4, 8, 16, 34 ] - prefill: - num-worker: 1 - tp: 4 - ep: 4 - dp-attn: false - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 3 - tp: 8 - ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=0" - - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - conc-list: [ 256, 538 ] - prefill: - num-worker: 4 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 32 - ep: 32 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=16" - - "DECODE_MAX_BATCH_SIZE=16" - 
- "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=0" - - - conc-list: [ 1075 ] - prefill: - num-worker: 6 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - - conc-list: [ 2150 ] - prefill: - num-worker: 8 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 16 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - - conc-list: [ 2150 ] - prefill: - num-worker: 5 - tp: 4 - ep: 4 - dp-attn: true - additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=0" + # MTP configurations (spec_decoding="mtp") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8, 18 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=3" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) + - spec-decoding: "mtp" + conc-list: [ 128, 269 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - 
"PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=8" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 538 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=3" + + - spec-decoding: "mtp" + conc-list: [ 1075 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=2" + + - spec-decoding: "mtp" + conc-list: [ 2150 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=512" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=1" + + # Non-MTP configurations (default spec_decoding="none") + # tep - Run Tensor-Expert Parallel mode (attention_dp=false) + - conc-list: [ 1, 2, 4, 8, 16, 34 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: false + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 3 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_MAX_NUM_TOKENS=32" + - "DECODE_MAX_BATCH_SIZE=32" + - "DECODE_GPU_MEM_FRACTION=0.9" + - "DECODE_MTP_SIZE=0" + + # dep - Run Data-Expert Parallel mode (attention_dp=true) 
+ - conc-list: [ 256, 538 ] + prefill: + num-worker: 4 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=16" + - "DECODE_MAX_BATCH_SIZE=16" + - "DECODE_GPU_MEM_FRACTION=0.7" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 1075 ] + prefill: + num-worker: 6 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=64" + - "DECODE_MAX_BATCH_SIZE=64" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 2150 ] + prefill: + num-worker: 8 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=128" + - "DECODE_MAX_BATCH_SIZE=128" + - "DECODE_GPU_MEM_FRACTION=0.75" + - "DECODE_MTP_SIZE=0" + + - conc-list: [ 2150 ] + prefill: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + - "PREFILL_MAX_NUM_TOKENS=8448" + - "PREFILL_MAX_BATCH_SIZE=1" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "DECODE_MAX_NUM_TOKENS=256" + - "DECODE_MAX_BATCH_SIZE=256" + - "DECODE_GPU_MEM_FRACTION=0.8" + - "DECODE_MTP_SIZE=0" dsr1-fp8-gb200-dynamo-sglang: image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 @@ -781,9 +781,51 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 1024 osl: 1024 search-space: + # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP32) - spec-decoding: "none" - # conc-list: [ 1024, 2048, 4096, 4608, 4864, 4992, 5120, 5376, 5632, 6144, 8192 ] - conc-list: [ 1024, 2048 ] + conc-list: [ 4096 ] + prefill: + num-worker: 2 + # tp, ep, and dp-attn do nothing because 
they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=4" + - "N_ADDITIONAL_FRONTENDS=9" + decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" + + # "Bottom of curve" (1 prefill worker at DEP4 and 4 decode workers at DEP4) + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 64, 128 ] + prefill: + num-worker: 1 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=9" + decode: + num-worker: 4 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=4" + + # "Middle of curve" (3 prefill workers each at DEP8 and 1 decode worker at DEP48) + - spec-decoding: "none" + conc-list: [ 1024, 2048, 4096 ] prefill: num-worker: 3 # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: @@ -793,7 +835,7 @@ dsr1-fp8-gb200-dynamo-sglang: dp-attn: true additional-settings: - "PREFILL_NODES=6" - - "N_ADDITIONAL_FRONTENDS=8" + - "N_ADDITIONAL_FRONTENDS=9" decode: num-worker: 1 tp: 1 @@ -801,12 +843,12 @@ dsr1-fp8-gb200-dynamo-sglang: dp-attn: true additional-settings: - "DECODE_NODES=12" + - isl: 8192 osl: 1024 search-space: - spec-decoding: "none" - # conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] - conc-list: [ 128 ] + conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] prefill: num-worker: 6 # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 9f611ef32..0ddf08b22 100644 --- 
a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -12,8 +12,18 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." -git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git -cd "dynamo/components/backends/sglang/slurm_jobs" +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git +else + git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git +fi + +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" +else + SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs" +fi +cd "$SGL_SLURM_JOBS_PATH" # Set up SGL launch script-specific environment variables export TIME_LIMIT="04:00:00" diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index e4d3827c3..384fef28b 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -13,7 +13,12 @@ export NTASKS_PER_NODE=4 ### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars if [[ $FRAMEWORK == "dynamo-sglang" ]]; then - export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" + # Set IMAGE based on ISL/OSL + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + export IMAGE="/mnt/lustre01/artifacts/containers/lmsysorg+sglang+v0.5.5.post2.sqsh" + else + export IMAGE="/mnt/lustre01/artifacts/containers/dynamo-sglang.sqsh" + fi export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" else @@ -104,9 +109,22 @@ if [[ $FRAMEWORK == "dynamo-trt" ]]; then fi done else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement - # Find the latest log directory - # we do "tail -1" here since only the latest job will yield the result - LOGS_DIR=$(find . 
-path "*/logs/*/vllm_isl_${ISL}_osl_${OSL}" -type d 2>/dev/null | sort -V | tail -1) + # Find the latest log directory that contains the data + cat > collect_latest_results.py <<'PY' +import os, sys +isl, osl, nexp = [int(x) for x in sys.argv[1:]] +for path in sorted([f"logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir("logs/") if os.path.isdir(f"logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: + print(path) +PY + + # This isn't ideal but for now, this is needed for the collect_latest_results.py script + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + NUMBER_OF_EXPERIMENTS=3 + else + NUMBER_OF_EXPERIMENTS=1 + fi + + LOGS_DIR=$(python3 collect_latest_results.py $ISL $OSL $NUMBER_OF_EXPERIMENTS) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From c95a3e47c2857d435fede9bb61459386a146ce70 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 11:42:35 -0600 Subject: [PATCH 74/98] add updates for newest gb200 merge pt 2 --- benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 7 +------ runners/launch_gb200-nv.sh | 14 +++++++++++--- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 0ddf08b22..c48813439 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -8,7 +8,7 @@ source "$(dirname "$0")/benchmark_lib.sh" check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ - PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME # Always clone and setup Dynamo echo "Cloning Dynamo repository..." 
@@ -18,11 +18,6 @@ else git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git fi -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" -else - SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs" -fi cd "$SGL_SLURM_JOBS_PATH" # Set up SGL launch script-specific environment variables diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 5eccafb41..193da620f 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -24,6 +24,14 @@ if [[ $FRAMEWORK == "dynamo-sglang" ]]; then fi export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" + + # FIXME: Another workaround for all the different branching + # THIS NEEDS TO BE STANDARDIZED ASAP + if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" + else + export SGL_SLURM_JOBS_PATH="dynamo/components/backends/sglang/slurm_jobs" + fi else SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" @@ -115,8 +123,8 @@ else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement # Find the latest log directory that contains the data cat > collect_latest_results.py <<'PY' import os, sys -isl, osl, nexp = [int(x) for x in sys.argv[1:]] -for path in sorted([f"logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir("logs/") if os.path.isdir(f"logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: +sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) +for path in sorted([f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}" for name in os.listdir(f"{sgl_job_dir}/logs/") if 
os.path.isdir(f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}")], key=os.path.getmtime, reverse=True)[:nexp]: print(path) PY @@ -127,7 +135,7 @@ PY NUMBER_OF_EXPERIMENTS=1 fi - LOGS_DIR=$(python3 collect_latest_results.py $ISL $OSL $NUMBER_OF_EXPERIMENTS) + LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" $ISL $OSL $NUMBER_OF_EXPERIMENTS) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From 5eb8ea1ba646a95c7bf64f22fbe3c89743f69f77 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 14:53:53 -0600 Subject: [PATCH 75/98] move ntasks per node to framework level instead of runner level --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 3 ++- runners/launch_gb200-nv.sh | 2 -- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index 51037c58f..e259402ab 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -39,10 +39,11 @@ fi kind=dynamo_disagg additional_slurm_args="--time=04:00:00" +ntasks_per_node=4 gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) -total_tasks=$((total_nodes * NTASKS_PER_NODE)) +total_tasks=$((total_nodes * ntasks_per_node)) decode_eplb_num_slots=0 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index 193da620f..f00ea6d03 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -8,8 +8,6 @@ set -x export SLURM_PARTITION="batch" export SLURM_ACCOUNT="benchmark" export SLURM_JOB_NAME="benchmark-dynamo.job" -# For GB200 we have 4 GPUs per node -export NTASKS_PER_NODE=4 # For SGLang - we are working on updating the 8k1k configs # For now we add conditionals to this script to use newer code for the 1k1k configs From 711783f6ef1df7ee03e7530239755e16a2d14d7d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 
14:57:24 -0600 Subject: [PATCH 76/98] nexp hard coded to 1: --- runners/launch_gb200-nv.sh | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index f00ea6d03..d9164469e 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -126,14 +126,7 @@ for path in sorted([f"{sgl_job_dir}/logs/{name}/vllm_isl_{isl}_osl_{osl}" for na print(path) PY - # This isn't ideal but for now, this is needed for the collect_latest_results.py script - if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - NUMBER_OF_EXPERIMENTS=3 - else - NUMBER_OF_EXPERIMENTS=1 - fi - - LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" $ISL $OSL $NUMBER_OF_EXPERIMENTS) + LOGS_DIR=$(python3 collect_latest_results.py "$SGL_SLURM_JOBS_PATH" $ISL $OSL 1) if [ -z "$LOGS_DIR" ]; then echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" exit 1 From 0a19e436383a7833e3f11077c816eaa305242052 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:17:17 -0600 Subject: [PATCH 77/98] add AMD configs to full sweep --- .github/workflows/full-sweep-1k1k-scheduler.yml | 6 +++--- .github/workflows/full-sweep-1k8k-scheduler.yml | 8 ++++---- .github/workflows/full-sweep-8k1k-scheduler.yml | 8 ++++---- 3 files changed, 11 insertions(+), 11 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 0aac4a598..0068817b6 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -16,7 +16,7 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + 
CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss 
${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 911c59c87..5699988ce 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -16,8 +16,8 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo 
"multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 67bfeaa9a..55bcbea6b 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -16,8 +16,8 @@ jobs: - id: get-dsr1-configs 
run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k 
--model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT From d0412788fd5e435850e7bbcaf70f4f2a6e9e1b66 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:19:19 -0600 Subject: [PATCH 78/98] shut the line counter workflow up haha --- .github/workflows/pr-line-counter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-line-counter.yml b/.github/workflows/pr-line-counter.yml index c616493ed..c856577de 100644 --- a/.github/workflows/pr-line-counter.yml +++ b/.github/workflows/pr-line-counter.yml @@ -4,7 +4,7 @@ on: pull_request: types: [opened, synchronize, reopened, ready_for_review] paths: - - 'utils/matrix-logic/**' + - 'utils/matrix-logic/generate_sweep_configs.py' permissions: contents: read From a604573cb113d1c8255ecefba4b12076692f8c2b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:20:07 -0600 Subject: [PATCH 79/98] shut the line counter workflow up haha --- .github/workflows/pr-line-counter.yml | 12 ++++++------ 1 file changed, 6 
insertions(+), 6 deletions(-) diff --git a/.github/workflows/pr-line-counter.yml b/.github/workflows/pr-line-counter.yml index c856577de..399b35f29 100644 --- a/.github/workflows/pr-line-counter.yml +++ b/.github/workflows/pr-line-counter.yml @@ -74,7 +74,7 @@ jobs: - name: Generate summary run: | - echo "## 📊 Line Count Report" >> $GITHUB_STEP_SUMMARY + echo "## Line Count Report" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY FILE_EXISTS="${{ steps.line-count.outputs.file_exists }}" @@ -94,20 +94,20 @@ jobs: echo "**Base Lines:** $BASE_LINE_COUNT" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY if [ "$LINE_DIFF" -gt 0 ]; then - echo "**Change:** +$LINE_DIFF lines 📈" >> $GITHUB_STEP_SUMMARY + echo "**Change:** +$LINE_DIFF lines" >> $GITHUB_STEP_SUMMARY elif [ "$LINE_DIFF" -lt 0 ]; then - echo "**Change:** $LINE_DIFF lines 📉" >> $GITHUB_STEP_SUMMARY + echo "**Change:** $LINE_DIFF lines" >> $GITHUB_STEP_SUMMARY else - echo "**Change:** No change ➡️" >> $GITHUB_STEP_SUMMARY + echo "**Change:** No change" >> $GITHUB_STEP_SUMMARY fi else echo "" >> $GITHUB_STEP_SUMMARY echo "**Base Lines:** 0 (new file)" >> $GITHUB_STEP_SUMMARY echo "" >> $GITHUB_STEP_SUMMARY - echo "**Change:** +$LINE_DIFF lines 📈" >> $GITHUB_STEP_SUMMARY + echo "**Change:** +$LINE_DIFF lines" >> $GITHUB_STEP_SUMMARY fi else - echo "⚠️ **File not found:** \`$TARGET_FILE\`" >> $GITHUB_STEP_SUMMARY + echo "**File not found:** \`$TARGET_FILE\`" >> $GITHUB_STEP_SUMMARY fi - name: Comment on PR From feae71719c3d3180c4684a0642a6edd7a8ae7de9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:21:07 -0600 Subject: [PATCH 80/98] shut the line counter workflow up haha pt 2 --- .github/workflows/pr-line-counter.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/pr-line-counter.yml b/.github/workflows/pr-line-counter.yml index 399b35f29..a01de5629 100644 --- a/.github/workflows/pr-line-counter.yml +++ b/.github/workflows/pr-line-counter.yml @@ 
-2,7 +2,7 @@ name: PR Line Counter on: pull_request: - types: [opened, synchronize, reopened, ready_for_review] + types: [opened, reopened, ready_for_review] paths: - 'utils/matrix-logic/generate_sweep_configs.py' From 13a2761f0d70358a042015b507b1aa41b15c0194 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:26:37 -0600 Subject: [PATCH 81/98] updating testing logic --- .github/workflows/README.md | 2 +- .github/workflows/e2e-tests.yml | 2 +- .github/workflows/full-sweep-1k1k-scheduler.yml | 8 ++++---- .github/workflows/full-sweep-1k8k-scheduler.yml | 8 ++++---- .github/workflows/full-sweep-8k1k-scheduler.yml | 8 ++++---- .github/workflows/label-validation.yml | 2 +- .github/workflows/pr-line-counter.yml | 4 ++-- .github/workflows/test-matrix-logic.yml | 10 +++++++--- .../generate_sweep_configs.py | 0 utils/{matrix-logic => matrix_logic}/pytest.ini | 0 .../test_generate_sweep_configs.py | 0 .../{matrix-logic => matrix_logic}/test_validation.py | 0 utils/{matrix-logic => matrix_logic}/validation.py | 4 ++-- 13 files changed, 26 insertions(+), 22 deletions(-) rename utils/{matrix-logic => matrix_logic}/generate_sweep_configs.py (100%) rename utils/{matrix-logic => matrix_logic}/pytest.ini (100%) rename utils/{matrix-logic => matrix_logic}/test_generate_sweep_configs.py (100%) rename utils/{matrix-logic => matrix_logic}/test_validation.py (100%) rename utils/{matrix-logic => matrix_logic}/validation.py (99%) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 003b8809f..d5985a4b3 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -1,6 +1,6 @@ # How to Test Workflows -In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. As input, this workflow takes in the CLI arguments for the `utils/matrix-logic/generate_sweep_configs.py` script. 
The usage for this script is shown below: +In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. As input, this workflow takes in the CLI arguments for the `utils/matrix_logic/generate_sweep_configs.py` script. The usage for this script is shown below: ``` usage: generate_sweep_configs.py [-h] {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} ... diff --git a/.github/workflows/e2e-tests.yml b/.github/workflows/e2e-tests.yml index b4398b49c..7025a2a5c 100644 --- a/.github/workflows/e2e-tests.yml +++ b/.github/workflows/e2e-tests.yml @@ -25,7 +25,7 @@ jobs: - id: get-jobs run: | pip install pydantic - CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py \ + CONFIG_JSON=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py \ ${{ inputs.generate-cli-command }} \ --runner-config .github/configs/runners.yaml \ --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 0068817b6..2ae230dd6 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -16,8 +16,8 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config 
${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + 
CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index 5699988ce..ece52599d 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -16,8 +16,8 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep 
--single-node --seq-lens 1k8k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k8k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo 
"single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 55bcbea6b..49bbc2a87 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -16,8 +16,8 @@ jobs: - id: get-dsr1-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: 
get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix-logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 8k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 84fa6e891..56335e706 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -53,7 +53,7 @@ jobs: for label in matching: result = subprocess.run([ - 'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix-logic/generate_sweep_configs.py", + 
'python3', f"{os.environ['GITHUB_WORKSPACE']}/utils/matrix_logic/generate_sweep_configs.py", 'full-sweep', '--runner-type', label['runner-type'], '--model-prefix', label['model-prefix'], diff --git a/.github/workflows/pr-line-counter.yml b/.github/workflows/pr-line-counter.yml index a01de5629..b9f4bf6b1 100644 --- a/.github/workflows/pr-line-counter.yml +++ b/.github/workflows/pr-line-counter.yml @@ -4,7 +4,7 @@ on: pull_request: types: [opened, reopened, ready_for_review] paths: - - 'utils/matrix-logic/generate_sweep_configs.py' + - 'utils/matrix_logic/generate_sweep_configs.py' permissions: contents: read @@ -17,7 +17,7 @@ jobs: contents: read pull-requests: write env: - TARGET_FILE: utils/matrix-logic/generate_sweep_configs.py + TARGET_FILE: utils/matrix_logic/generate_sweep_configs.py steps: - name: Checkout code diff --git a/.github/workflows/test-matrix-logic.yml b/.github/workflows/test-matrix-logic.yml index 38cfef95e..a2201c1cf 100644 --- a/.github/workflows/test-matrix-logic.yml +++ b/.github/workflows/test-matrix-logic.yml @@ -3,7 +3,7 @@ name: Test Matrix Logic on: pull_request: paths: - - 'utils/matrix-logic/**' + - 'utils/matrix_logic/**' permissions: contents: read @@ -29,8 +29,12 @@ jobs: python -m pip install --upgrade pip pip install pytest pydantic pyyaml - - name: Run pytest + - name: test_generate_sweep_configs tests run: | - cd utils/matrix-logic + cd utils/matrix_logic pytest test_generate_sweep_configs.py -v + + - name: test_validation tests + run: | + cd utils/matrix_logic pytest test_validation.py -v diff --git a/utils/matrix-logic/generate_sweep_configs.py b/utils/matrix_logic/generate_sweep_configs.py similarity index 100% rename from utils/matrix-logic/generate_sweep_configs.py rename to utils/matrix_logic/generate_sweep_configs.py diff --git a/utils/matrix-logic/pytest.ini b/utils/matrix_logic/pytest.ini similarity index 100% rename from utils/matrix-logic/pytest.ini rename to utils/matrix_logic/pytest.ini diff --git 
a/utils/matrix-logic/test_generate_sweep_configs.py b/utils/matrix_logic/test_generate_sweep_configs.py similarity index 100% rename from utils/matrix-logic/test_generate_sweep_configs.py rename to utils/matrix_logic/test_generate_sweep_configs.py diff --git a/utils/matrix-logic/test_validation.py b/utils/matrix_logic/test_validation.py similarity index 100% rename from utils/matrix-logic/test_validation.py rename to utils/matrix_logic/test_validation.py diff --git a/utils/matrix-logic/validation.py b/utils/matrix_logic/validation.py similarity index 99% rename from utils/matrix-logic/validation.py rename to utils/matrix_logic/validation.py index 0ae173780..30012423a 100644 --- a/utils/matrix-logic/validation.py +++ b/utils/matrix_logic/validation.py @@ -52,7 +52,7 @@ class Fields(Enum): """ - Below is the validation logic for the OUTPUT of utils/matrix-logic/generate_sweep_configs.py, i.e., + Below is the validation logic for the OUTPUT of utils/matrix_logic/generate_sweep_configs.py, i.e., the input to the actual workflow files. The validation enforces a strict set of rules on the structure of the generated matrix entries to ensure correctness before proceeding with benchmarking. This ensures that no validation has to happen in the workflow itself, i.e., at runtime, it is assumed that all inputs @@ -140,7 +140,7 @@ def validate_matrix_entry(entry: dict, is_multinode: bool) -> dict: """ - Below is the validation logic for the INPUT to utils/matrix-logic/generate_sweep_configs.py, i.e., + Below is the validation logic for the INPUT to utils/matrix_logic/generate_sweep_configs.py, i.e., the master configuration files found in .github/configs. The validation enforces a strict set of rules on the structure of the master configuration files to ensure correctness before proceeding with matrix generation. 
From 8811b2d8e000a12cfb00b04b6af3ef8f542eaca9 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:27:32 -0600 Subject: [PATCH 82/98] add model prefix to label validator --- .github/workflows/label-validation.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/label-validation.yml b/.github/workflows/label-validation.yml index 56335e706..2602d4e8c 100644 --- a/.github/workflows/label-validation.yml +++ b/.github/workflows/label-validation.yml @@ -95,6 +95,7 @@ jobs: runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} + model-prefix: ${{ matrix.config.model-prefix }} framework: ${{ matrix.config.framework }} precision: ${{ matrix.config.precision }} tp: ${{ matrix.config.tp }} From 2a8626c26bdc9911d6ea56f30bec851c66e75687 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:28:58 -0600 Subject: [PATCH 83/98] add more descriptive name to tests --- .github/workflows/test-matrix-logic.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/test-matrix-logic.yml b/.github/workflows/test-matrix-logic.yml index a2201c1cf..79011e824 100644 --- a/.github/workflows/test-matrix-logic.yml +++ b/.github/workflows/test-matrix-logic.yml @@ -1,4 +1,5 @@ name: Test Matrix Logic +run-name: "Config Parsing Pytests PR #${{ github.event.pull_request.number }}" on: pull_request: From d126dca190cddd98bf4d9b335392150adefe1625 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Tue, 2 Dec 2025 15:41:02 -0600 Subject: [PATCH 84/98] update test for process results --- utils/test_process_result.py | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/test_process_result.py b/utils/test_process_result.py index e223e256a..2a6389a78 100644 --- a/utils/test_process_result.py +++ b/utils/test_process_result.py @@ -46,6 +46,7 @@ def base_env_vars(): "ISL": "1024", "OSL": "1024", "DISAGG": "false", + "MODEL_PREFIX": "dsr1", } From 7ee5bdb25307fd1bb397db2caaea0b43f6b6a48d Mon Sep 17 
00:00:00 2001 From: Cam Quilici Date: Wed, 3 Dec 2025 10:38:17 -0600 Subject: [PATCH 85/98] add script mode --- .github/configs/nvidia-master.yaml | 1 + benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 861efb20c..df86c66a0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -840,6 +840,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=6" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=1p_4d" decode: num-worker: 1 tp: 1 diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index c48813439..99e2c7afd 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -35,4 +35,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $DECODE_NODES \ $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ - $ISL $OSL "${CONC_LIST// /x}" inf \ No newline at end of file + $ISL $OSL "${CONC_LIST// /x}" inf \ + $SCRIPT_MODE \ No newline at end of file From f22cf47d3a321bbd4e7ae8233c39a445e6aaf88e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Wed, 3 Dec 2025 16:01:47 -0600 Subject: [PATCH 86/98] fix bug --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index df86c66a0..c4370f483 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -819,6 +819,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=1p_4d" decode: num-worker: 4 tp: 1 @@ -840,7 +841,6 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=6" - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=1p_4d" decode: num-worker: 1 tp: 1 From efcb4e477de4783465e8177aec3477e5da07983a Mon 
Sep 17 00:00:00 2001 From: ishandhanani <82981111+ishandhanani@users.noreply.github.com> Date: Thu, 4 Dec 2025 07:07:20 -0800 Subject: [PATCH 87/98] sglang: add fp8 8k1k and fp4 1k1k (#274) * go * typo * typo... * more --- .github/configs/nvidia-master.yaml | 103 +++++++++++++++++- .../dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 37 +++++++ .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 8 +- 3 files changed, 137 insertions(+), 11 deletions(-) create mode 100644 benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c4370f483..71fb257e8 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -798,6 +798,7 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=4" - "N_ADDITIONAL_FRONTENDS=9" + - "SCRIPT_MODE=max-tpt" decode: num-worker: 1 tp: 1 @@ -852,17 +853,16 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: + # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4) - spec-decoding: "none" - conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] + conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ] prefill: - num-worker: 6 - # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: - # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh + num-worker: 1 tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=12" + - "PREFILL_NODES=1" - "N_ADDITIONAL_FRONTENDS=8" decode: num-worker: 1 @@ -870,4 +870,95 @@ dsr1-fp8-gb200-dynamo-sglang: ep: 1 dp-attn: true additional-settings: - - "DECODE_NODES=6" + - "DECODE_NODES=1" + + # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 6144 ] + prefill: + num-worker: 5 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + - "N_ADDITIONAL_FRONTENDS=8" 
+ decode: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" + +dsr1-fp4-gb200-dynamo-sglang: + # TODO: swap + image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 + # TODO: what is the right name? + model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 + model-prefix: dsr1 + runner: gb200 + precision: fp4 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) + - spec-decoding: "none" + conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=2" + + # Mid curve (1 prefill worker at DEP4 and 1 decode workers at DEP48) + - spec-decoding: "none" + conc-list: [ 512, 1024, 2048, 4096, 8192 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=12" + + # Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32) + - spec-decoding: "none" + conc-list: [ 8192, 12000, 15000 ] + prefill: + num-worker: 1 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "PREFILL_NODES=1" + - "N_ADDITIONAL_FRONTENDS=8" + decode: + num-worker: 2 + tp: 1 + ep: 1 + dp-attn: true + additional-settings: + - "DECODE_NODES=8" \ No newline at end of file diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh new file mode 100644 index 000000000..7a668f30c --- /dev/null +++ b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh @@ -0,0 +1,37 @@ + +#!/bin/bash + +set -x + +source "$(dirname "$0")/benchmark_lib.sh" + +check_env_vars CONC_LIST ISL OSL IMAGE 
SPEC_DECODING MODEL_PATH \ + PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ + DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ + PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME + +# Always clone and setup Dynamo +echo "Cloning Dynamo repository..." +git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git + +cd "$SGL_SLURM_JOBS_PATH" + +# Set up SGL launch script-specific environment variables +export TIME_LIMIT="04:00:00" +export MODEL_PATH=$MODEL_PATH +export CONFIG_DIR=$CONFIG_DIR +export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp4" + +# Launch jobs based on ISL/OSL +# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented +# by a list of numbers delimted by 'x'. This is because of how the underlying launch script +# expects the concurrencies. +bash ./submit_disagg.sh $PREFILL_NODES \ + $PREFILL_NUM_WORKERS \ + $DECODE_NODES \ + $DECODE_NUM_WORKERS \ + $N_ADDITIONAL_FRONTENDS \ + $ISL $OSL "${CONC_LIST// /x}" inf \ + $GPU_TYPE \ + $SCRIPT_MODE \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 99e2c7afd..4e44b0414 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -12,11 +12,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." 
-if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git -else - git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git -fi +git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git cd "$SGL_SLURM_JOBS_PATH" @@ -25,6 +21,7 @@ export TIME_LIMIT="04:00:00" export MODEL_PATH=$MODEL_PATH export CONFIG_DIR=$CONFIG_DIR export CONTAINER_IMAGE=$IMAGE +export GPU_TYPE="gb200-fp8" # Launch jobs based on ISL/OSL # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented @@ -36,4 +33,5 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf \ + $GPU_TYPE \ $SCRIPT_MODE \ No newline at end of file From 21ec1337125639d9871b1f0a3750980d5224579c Mon Sep 17 00:00:00 2001 From: Cameron Quilici Date: Thu, 4 Dec 2025 10:25:44 -0600 Subject: [PATCH 88/98] Revert "sglang: add fp8 8k1k and fp4 1k1k (#274)" (#283) This reverts commit efcb4e477de4783465e8177aec3477e5da07983a. 
--- .github/configs/nvidia-master.yaml | 103 +----------------- .../dsr1_fp4_gb200_dynamo-sglang_slurm.sh | 37 ------- .../dsr1_fp8_gb200_dynamo-sglang_slurm.sh | 8 +- 3 files changed, 11 insertions(+), 137 deletions(-) delete mode 100644 benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 71fb257e8..c4370f483 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -798,7 +798,6 @@ dsr1-fp8-gb200-dynamo-sglang: additional-settings: - "PREFILL_NODES=4" - "N_ADDITIONAL_FRONTENDS=9" - - "SCRIPT_MODE=max-tpt" decode: num-worker: 1 tp: 1 @@ -853,112 +852,22 @@ dsr1-fp8-gb200-dynamo-sglang: - isl: 8192 osl: 1024 search-space: - # Low latency (1 prefill worker at DEP4 and 1 decode worker at DEP4) - spec-decoding: "none" - conc-list: [ 4, 8, 16, 32, 64, 128, 256, 512 ] + conc-list: [ 128, 256, 384, 448, 512, 576, 1024, 2048, 4096 ] prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - # Middle and top of curve (5 prefill workers each at DEP8 and 1 decode worker at DEP32) - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 6144 ] - prefill: - num-worker: 5 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=8" - -dsr1-fp4-gb200-dynamo-sglang: - # TODO: swap - image: nvcr.io/nvidia/ai-dynamo/sglang-runtime:0.5.1-rc0.pre1 - # TODO: what is the right name? 
- model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 - model-prefix: dsr1 - runner: gb200 - precision: fp4 - framework: dynamo-sglang - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # Low latency (1 prefill worker at DEP4 and 2 decode workers at DEP4) - - spec-decoding: "none" - conc-list: [ 4, 8, 32, 64, 128, 112, 128, 256 ] - prefill: - num-worker: 1 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - # Mid curve (1 prefill worker at DEP4 and 1 decode workers at DEP48) - - spec-decoding: "none" - conc-list: [ 512, 1024, 2048, 4096, 8192 ] - prefill: - num-worker: 1 + num-worker: 6 + # tp, ep, and dp-attn do nothing because they are hardcoded in the following file: + # https://github.com/Elnifio/dynamo/blob/update-result-file-name/components/backends/sglang/slurm_jobs/scripts/gb200-fp8.sh tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=12" - "N_ADDITIONAL_FRONTENDS=8" decode: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=12" - - # Top of curve (1 prefill worker at DEP4 and 1 decode worker at DEP32) - - spec-decoding: "none" - conc-list: [ 8192, 12000, 15000 ] - prefill: num-worker: 1 tp: 1 ep: 1 dp-attn: true additional-settings: - - "PREFILL_NODES=1" - - "N_ADDITIONAL_FRONTENDS=8" - decode: - num-worker: 2 - tp: 1 - ep: 1 - dp-attn: true - additional-settings: - - "DECODE_NODES=8" \ No newline at end of file + - "DECODE_NODES=6" diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh deleted file mode 100644 index 7a668f30c..000000000 --- a/benchmarks/dsr1_fp4_gb200_dynamo-sglang_slurm.sh +++ /dev/null @@ -1,37 +0,0 @@ - -#!/bin/bash - -set -x - -source "$(dirname "$0")/benchmark_lib.sh" - -check_env_vars CONC_LIST ISL OSL 
IMAGE SPEC_DECODING MODEL_PATH \ - PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ - DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ - PREFILL_NODES DECODE_NODES N_ADDITIONAL_FRONTENDS SGL_SLURM_JOBS_PATH # SGL_SLURM_JOBS_PATH FIXME - -# Always clone and setup Dynamo -echo "Cloning Dynamo repository..." -git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git - -cd "$SGL_SLURM_JOBS_PATH" - -# Set up SGL launch script-specific environment variables -export TIME_LIMIT="04:00:00" -export MODEL_PATH=$MODEL_PATH -export CONFIG_DIR=$CONFIG_DIR -export CONTAINER_IMAGE=$IMAGE -export GPU_TYPE="gb200-fp4" - -# Launch jobs based on ISL/OSL -# Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented -# by a list of numbers delimted by 'x'. This is because of how the underlying launch script -# expects the concurrencies. -bash ./submit_disagg.sh $PREFILL_NODES \ - $PREFILL_NUM_WORKERS \ - $DECODE_NODES \ - $DECODE_NUM_WORKERS \ - $N_ADDITIONAL_FRONTENDS \ - $ISL $OSL "${CONC_LIST// /x}" inf \ - $GPU_TYPE \ - $SCRIPT_MODE \ No newline at end of file diff --git a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh index 4e44b0414..99e2c7afd 100644 --- a/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh +++ b/benchmarks/dsr1_fp8_gb200_dynamo-sglang_slurm.sh @@ -12,7 +12,11 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING MODEL_PATH \ # Always clone and setup Dynamo echo "Cloning Dynamo repository..." 
-git clone --branch ishan/sa-1.1-sgl-dsr1 https://github.com/ai-dynamo/dynamo.git +if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then + git clone --branch ishan/sa-1.1-sgl-dsr1-fp8 https://github.com/ai-dynamo/dynamo.git +else + git clone --branch update-result-file-name https://github.com/Elnifio/dynamo.git +fi cd "$SGL_SLURM_JOBS_PATH" @@ -21,7 +25,6 @@ export TIME_LIMIT="04:00:00" export MODEL_PATH=$MODEL_PATH export CONFIG_DIR=$CONFIG_DIR export CONTAINER_IMAGE=$IMAGE -export GPU_TYPE="gb200-fp8" # Launch jobs based on ISL/OSL # Replace ' ' in CONC_LIST with 'x' such that the concurrency list is represented @@ -33,5 +36,4 @@ bash ./submit_disagg.sh $PREFILL_NODES \ $DECODE_NUM_WORKERS \ $N_ADDITIONAL_FRONTENDS \ $ISL $OSL "${CONC_LIST// /x}" inf \ - $GPU_TYPE \ $SCRIPT_MODE \ No newline at end of file From ecc2025d785f3b750e5858e01559f02a55f3a58e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Thu, 4 Dec 2025 20:44:13 -0600 Subject: [PATCH 89/98] get rid of ntasks per node required env var for sglang --- benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index e259402ab..b5f606625 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -8,8 +8,7 @@ check_env_vars CONC_LIST ISL OSL IMAGE SPEC_DECODING \ PREFILL_NUM_WORKERS PREFILL_TP PREFILL_EP PREFILL_DP_ATTN \ DECODE_NUM_WORKERS DECODE_TP DECODE_EP DECODE_DP_ATTN \ PREFILL_MAX_NUM_TOKENS PREFILL_MAX_BATCH_SIZE DECODE_MAX_NUM_TOKENS \ - DECODE_MAX_BATCH_SIZE DECODE_GPU_MEM_FRACTION \ - NTASKS_PER_NODE + DECODE_MAX_BATCH_SIZE DECODE_GPU_MEM_FRACTION if [ "$SPEC_DECODING" == "mtp" ]; then check_env_vars DECODE_MTP_SIZE From b8d6b23497363c7ba7249c4931871430904dacae Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 08:48:30 -0600 Subject: [PATCH 90/98] bug fix --- 
benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh index b5f606625..7a105158f 100644 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh +++ b/benchmarks/dsr1_fp4_gb200_dynamo-trt_slurm.sh @@ -48,7 +48,7 @@ decode_eplb_num_slots=0 sbatch --nodes=${total_nodes} \ --ntasks=${total_tasks} \ - --ntasks-per-node=${NTASKS_PER_NODE} \ + --ntasks-per-node=${ntasks_per_node} \ --segment=${total_nodes} ${additional_slurm_args} \ benchmark_disagg.slurm \ ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ From caa719714581763fbe9adc41ff8a2a7e548f0cca Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 08:57:14 -0600 Subject: [PATCH 91/98] bug fix missing amd --- .github/workflows/full-sweep-1k1k-scheduler.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index 2ae230dd6..aabd673a7 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -17,7 +17,7 @@ jobs: run: | pip install pydantic CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node 
--seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT @@ -33,8 +33,8 @@ jobs: - id: get-gptoss-configs run: | pip install pydantic - CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix gptoss --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo 
"single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT From 2d55f70b80f33fecb84cd940c0fd6fccbd2a7a2b Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 08:59:50 -0600 Subject: [PATCH 92/98] bug fix missing amd pt 2 --- .github/workflows/full-sweep-1k1k-scheduler.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index aabd673a7..d170d2c10 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -17,7 +17,7 @@ jobs: run: | pip install pydantic CONFIG_JSON_MULTI_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --multi-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) - CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml --config-files ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) + CONFIG_JSON_SINGLE_NODE=$(python3 ${GITHUB_WORKSPACE}/utils/matrix_logic/generate_sweep_configs.py full-sweep --single-node --seq-lens 1k1k --model-prefix dsr1 --config-files ${GITHUB_WORKSPACE}/.github/configs/amd-master.yaml ${GITHUB_WORKSPACE}/.github/configs/nvidia-master.yaml --runner-config ${GITHUB_WORKSPACE}/.github/configs/runners.yaml) echo "multi-node-search-space-config=$CONFIG_JSON_MULTI_NODE" >> $GITHUB_OUTPUT echo "single-node-search-space-config=$CONFIG_JSON_SINGLE_NODE" >> $GITHUB_OUTPUT From 3032a57d47f93dda2e7bfccbc02072d1d60e2601 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 
09:31:34 -0600 Subject: [PATCH 93/98] add served model name to summary --- utils/summarize.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/utils/summarize.py b/utils/summarize.py index be7808944..1f868381e 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -20,7 +20,7 @@ print("## Single-Node Results\n") single_node_header = '''\ -| Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | +| Model | Served Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(single_node_header) @@ -28,6 +28,7 @@ for result in single_node_results: print( f"| {result['infmax_model_prefix']} " + f"| {result['model']} " f"| {result['hw'].upper()} " f"| {result['framework'].upper()} " f"| {result['precision'].upper()} " @@ -49,12 +50,12 @@ print("\n") if multinode_results: - multinode_results.sort(key=lambda r: (r['model'], r['hw'], r['framework'], r['precision'], r['isl'], + multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) print("## Multi-Node Results\n") multinode_header = '''\ -| Model | Hardware | Framework | Precision | ISL | OSL | Prefill TP | Prefill EP | Prefill DP Attn | Prefill Workers | Prefill GPUs | Decode TP | Decode EP | Decode DP Attn | Decode Workers | Decode GPUs | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | +| Model | Served Model | Hardware | Framework | Precision | ISL | OSL | 
Prefill TP | Prefill EP | Prefill DP Attn | Prefill Workers | Prefill GPUs | Decode TP | Decode EP | Decode DP Attn | Decode Workers | Decode GPUs | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(multinode_header) @@ -62,6 +63,7 @@ for result in multinode_results: print( f"| {result['infmax_model_prefix']} " + f"| {result['model']} " f"| {result['hw'].upper()} " f"| {result['framework'].upper()} " f"| {result['precision'].upper()} " From 34b257f3321af2dae324b32596575bd59083764f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 09:43:10 -0600 Subject: [PATCH 94/98] add served model name to summary pt 2 --- utils/summarize.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/utils/summarize.py b/utils/summarize.py index 1f868381e..73809c44c 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -21,7 +21,7 @@ print("## Single-Node Results\n") single_node_header = '''\ | Model | Served Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(single_node_header) @@ -56,7 +56,7 @@ print("## Multi-Node Results\n") multinode_header = '''\ | Model | Served Model | Hardware | Framework | Precision | ISL | OSL | Prefill TP | Prefill EP | Prefill DP Attn | Prefill Workers | Prefill GPUs | Decode TP | Decode EP | Decode DP Attn | Decode Workers | Decode GPUs | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | 
TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ +| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ ''' print(multinode_header) From 38814e1626b9b05e9ab04b90ff817404468fe238 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 09:51:38 -0600 Subject: [PATCH 95/98] add served model name to summary pt 3 --- .github/workflows/collect-results.yml | 4 +- utils/summarize.py | 163 ++++++++++++++++---------- 2 files changed, 103 insertions(+), 64 deletions(-) diff --git a/.github/workflows/collect-results.yml b/.github/workflows/collect-results.yml index e4df99d1b..ca5de7fda 100644 --- a/.github/workflows/collect-results.yml +++ b/.github/workflows/collect-results.yml @@ -29,7 +29,9 @@ jobs: pattern: ${{ inputs.exp-name && format('{0}_*', inputs.exp-name) || '*' }} - name: Print summary - run: python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY + run: | + pip install tabulate + python3 utils/summarize.py results/ >> $GITHUB_STEP_SUMMARY - name: Aggregate results run: python3 utils/collect_results.py results/ ${{ inputs.exp-name || 'all' }} diff --git a/utils/summarize.py b/utils/summarize.py index 73809c44c..a46c2e02a 100644 --- a/utils/summarize.py +++ b/utils/summarize.py @@ -1,11 +1,41 @@ import sys import json from pathlib import Path +from tabulate import tabulate +# Header constants +MODEL = "Model" +SERVED_MODEL = "Served Model" +HARDWARE = "Hardware" +FRAMEWORK = "Framework" +PRECISION = "Precision" +ISL = "ISL" +OSL = "OSL" +TP = "TP" +EP = "EP" +DP_ATTENTION = "DP Attention" +CONC = "Conc" +TTFT = "TTFT (ms)" +TPOT = "TPOT (ms)" +INTERACTIVITY = "Interactivity (tok/s/user)" +E2EL = "E2EL (s)" +TPUT_PER_GPU = "TPUT per GPU" +OUTPUT_TPUT_PER_GPU = "Output TPUT per GPU" +INPUT_TPUT_PER_GPU = 
"Input TPUT per GPU" +PREFILL_TP = "Prefill TP" +PREFILL_EP = "Prefill EP" +PREFILL_DP_ATTN = "Prefill DP Attn" +PREFILL_WORKERS = "Prefill Workers" +PREFILL_GPUS = "Prefill GPUs" +DECODE_TP = "Decode TP" +DECODE_EP = "Decode EP" +DECODE_DP_ATTN = "Decode DP Attn" +DECODE_WORKERS = "Decode Workers" +DECODE_GPUS = "Decode GPUs" results = [] results_dir = Path(sys.argv[1]) -for result_path in results_dir.rglob(f'*.json'): +for result_path in results_dir.rglob('*.json'): with open(result_path) as f: result = json.load(f) results.append(result) @@ -13,78 +43,85 @@ single_node_results = [r for r in results if not r['is_multinode']] multinode_results = [r for r in results if r['is_multinode']] -# Single-node and multi-node results have different field and therefore need to be printed separately +# Single-node and multi-node results have different fields and therefore need to be printed separately if single_node_results: single_node_results.sort(key=lambda r: ( r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['tp'], r['ep'], r['conc'])) - print("## Single-Node Results\n") - single_node_header = '''\ -| Model | Served Model | Hardware | Framework | Precision | ISL | OSL | TP | EP | DP Attention | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ -''' - print(single_node_header) + single_node_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, TP, EP, DP_ATTENTION, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] - for result in single_node_results: - print( - f"| {result['infmax_model_prefix']} " - f"| {result['model']} " - f"| {result['hw'].upper()} " - f"| {result['framework'].upper()} " - f"| {result['precision'].upper()} " - f"| {result['isl']} " - f"| {result['osl']} 
" - f"| {result['tp']} " - f"| {result['ep']} " - f"| {result['dp_attention']} " - f"| {result['conc']} " - f"| {(result['median_ttft'] * 1000):.4f} " - f"| {(result['median_tpot'] * 1000):.4f} " - f"| {result['median_intvty']:.4f} " - f"| {result['median_e2el']:.4f} " - f"| {result['tput_per_gpu']:.4f} " - f"| {result['output_tput_per_gpu']:.4f} " - f"| {result['input_tput_per_gpu']:.4f} |" - ) + single_node_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['tp'], + r['ep'], + r['dp_attention'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in single_node_results + ] + print("## Single-Node Results\n") + print(tabulate(single_node_rows, headers=single_node_headers, tablefmt="github")) print("\n") if multinode_results: multinode_results.sort(key=lambda r: (r['infmax_model_prefix'], r['hw'], r['framework'], r['precision'], r['isl'], r['osl'], r['prefill_tp'], r['prefill_ep'], r['decode_tp'], r['decode_ep'], r['conc'])) - print("## Multi-Node Results\n") - multinode_header = '''\ -| Model | Served Model | Hardware | Framework | Precision | ISL | OSL | Prefill TP | Prefill EP | Prefill DP Attn | Prefill Workers | Prefill GPUs | Decode TP | Decode EP | Decode DP Attn | Decode Workers | Decode GPUs | Conc | TTFT (ms) | TPOT (ms) | Interactivity (tok/s/user) | E2EL (s) | TPUT per GPU | Output TPUT per GPU | Input TPUT per GPU | -| :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: | :-: |\ -''' - print(multinode_header) + multinode_headers = [ + MODEL, SERVED_MODEL, HARDWARE, FRAMEWORK, PRECISION, ISL, OSL, + PREFILL_TP, PREFILL_EP, PREFILL_DP_ATTN, PREFILL_WORKERS, 
PREFILL_GPUS, + DECODE_TP, DECODE_EP, DECODE_DP_ATTN, DECODE_WORKERS, DECODE_GPUS, + CONC, TTFT, TPOT, INTERACTIVITY, E2EL, TPUT_PER_GPU, OUTPUT_TPUT_PER_GPU, INPUT_TPUT_PER_GPU + ] - for result in multinode_results: - print( - f"| {result['infmax_model_prefix']} " - f"| {result['model']} " - f"| {result['hw'].upper()} " - f"| {result['framework'].upper()} " - f"| {result['precision'].upper()} " - f"| {result['isl']} " - f"| {result['osl']} " - f"| {result['prefill_tp']} " - f"| {result['prefill_ep']} " - f"| {result['prefill_dp_attention']} " - f"| {result['prefill_num_workers']} " - f"| {result['num_prefill_gpu']} " - f"| {result['decode_tp']} " - f"| {result['decode_ep']} " - f"| {result['decode_dp_attention']} " - f"| {result['decode_num_workers']} " - f"| {result['num_decode_gpu']} " - f"| {result['conc']} " - f"| {(result['median_ttft'] * 1000):.4f} " - f"| {(result['median_tpot'] * 1000):.4f} " - f"| {result['median_intvty']:.4f} " - f"| {result['median_e2el']:.4f} " - f"| {result['tput_per_gpu']:.4f} " - f"| {result['output_tput_per_gpu']:.4f} " - f"| {result['input_tput_per_gpu']:.4f} |" - ) + multinode_rows = [ + [ + r['infmax_model_prefix'], + r['model'], + r['hw'].upper(), + r['framework'].upper(), + r['precision'].upper(), + r['isl'], + r['osl'], + r['prefill_tp'], + r['prefill_ep'], + r['prefill_dp_attention'], + r['prefill_num_workers'], + r['num_prefill_gpu'], + r['decode_tp'], + r['decode_ep'], + r['decode_dp_attention'], + r['decode_num_workers'], + r['num_decode_gpu'], + r['conc'], + f"{r['median_ttft'] * 1000:.4f}", + f"{r['median_tpot'] * 1000:.4f}", + f"{r['median_intvty']:.4f}", + f"{r['median_e2el']:.4f}", + f"{r['tput_per_gpu']:.4f}", + f"{r['output_tput_per_gpu']:.4f}", + f"{r['input_tput_per_gpu']:.4f}", + ] + for r in multinode_results + ] + + print("## Multi-Node Results\n") + print(tabulate(multinode_rows, headers=multinode_headers, tablefmt="github")) From 54d0e421fe3653f1d7ea461acd2e25a6857fa489 Mon Sep 17 00:00:00 2001 From: Cam 
Quilici Date: Fri, 5 Dec 2025 10:15:03 -0600 Subject: [PATCH 96/98] fix max model len bug --- .github/workflows/full-sweep-1k1k-scheduler.yml | 8 ++++---- .github/workflows/full-sweep-1k8k-scheduler.yml | 8 ++++---- .github/workflows/full-sweep-8k1k-scheduler.yml | 8 ++++---- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/workflows/full-sweep-1k1k-scheduler.yml b/.github/workflows/full-sweep-1k1k-scheduler.yml index d170d2c10..dabb334f6 100644 --- a/.github/workflows/full-sweep-1k1k-scheduler.yml +++ b/.github/workflows/full-sweep-1k1k-scheduler.yml @@ -51,7 +51,7 @@ jobs: with: isl: 1024 osl: 1024 - max-model-len: 2048 + max-model-len: 2248 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -89,7 +89,7 @@ jobs: exp-name: "dsr1_1k1k" isl: 1024 osl: 1024 - max-model-len: 2048 + max-model-len: 2248 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -116,7 +116,7 @@ jobs: with: isl: 1024 osl: 1024 - max-model-len: 2048 + max-model-len: 2248 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -154,7 +154,7 @@ jobs: exp-name: "gptoss_1k1k" isl: 1024 osl: 1024 - max-model-len: 2048 + max-model-len: 2248 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} diff --git a/.github/workflows/full-sweep-1k8k-scheduler.yml b/.github/workflows/full-sweep-1k8k-scheduler.yml index ece52599d..015586de8 100644 --- a/.github/workflows/full-sweep-1k8k-scheduler.yml +++ b/.github/workflows/full-sweep-1k8k-scheduler.yml @@ -51,7 +51,7 @@ jobs: with: isl: 1024 osl: 8192 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -89,7 +89,7 @@ jobs: exp-name: "dsr1_1k8k" isl: 1024 osl: 8192 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ 
matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -116,7 +116,7 @@ jobs: with: isl: 1024 osl: 8192 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -154,7 +154,7 @@ jobs: exp-name: "gptoss_1k8k" isl: 1024 osl: 8192 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} diff --git a/.github/workflows/full-sweep-8k1k-scheduler.yml b/.github/workflows/full-sweep-8k1k-scheduler.yml index 49bbc2a87..7022900c7 100644 --- a/.github/workflows/full-sweep-8k1k-scheduler.yml +++ b/.github/workflows/full-sweep-8k1k-scheduler.yml @@ -51,7 +51,7 @@ jobs: with: isl: 8192 osl: 1024 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -89,7 +89,7 @@ jobs: exp-name: "dsr1_8k1k" isl: 8192 osl: 1024 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -116,7 +116,7 @@ jobs: with: isl: 8192 osl: 1024 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} @@ -154,7 +154,7 @@ jobs: exp-name: "gptoss_8k1k" isl: 8192 osl: 1024 - max-model-len: 2048 + max-model-len: 9416 runner: ${{ matrix.config.runner }} image: ${{ matrix.config.image }} model: ${{ matrix.config.model }} From 34870e3a42a19618ca2bd22bb1558600cdfb5f76 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 10:25:15 -0600 Subject: [PATCH 97/98] add readme --- .github/workflows/README.md | 217 +++++++++++++++++++++++++----------- 1 file changed, 155 insertions(+), 62 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index d5985a4b3..99a33959a 100644 --- 
a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -3,70 +3,102 @@ In order to test configurations described in `.github/configs`, the primary workflow file used is `.github/workflows/e2e-tests.yml`. As input, this workflow takes in the CLI arguments for the `utils/matrix_logic/generate_sweep_configs.py` script. The usage for this script is shown below: ``` -usage: generate_sweep_configs.py [-h] {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} ... +usage: generate_sweep_configs.py [-h] {full-sweep,runner-model-sweep} ... Generate benchmark configurations from YAML config files positional arguments: - {full-sweep,test-config,runner-model-sweep,runner-sweep,custom} + {full-sweep,runner-model-sweep} Available commands - full-sweep Generate full sweep configurations with optional filtering by model, precision, framework, runner type, and sequence lengths - test-config Given a config key, run that configuration as specified. Optionally specify --test-mode to only run one parallelism-concurrency pair for the config. - runner-model-sweep Given a runner type, find all configurations matching the type, and run that configuration on all individual runner nodes for the specified runner type. This is meant to validate - that all runner nodes work on all configurations for a runner type. For instance, to validate that all configs that specify an h200 runner successfully run across all h200 runner - nodes. - runner-sweep Given a model (and optionally a precision and framework), find all configurations matching the inputs, and run those configurations across all compatible runner nodes. This is - meant to validate all runner nodes that should run a particular model can. For instance, this should be used to validate that all runners nodes that should run gptoss-120b - actually do so successfully. 
- custom Enter custom values + full-sweep Generate full sweep configurations with optional + filtering by model, precision, framework, runner type, + and sequence lengths + runner-model-sweep Given a runner type, find all configurations matching + the type, and run that configuration on all individual + runner nodes for the specified runner type. This is + meant to validate that all runner nodes work on all + configurations for a runner type. For instance, to + validate that all configs that specify an h200 runner + successfully run across all h200 runner nodes. options: -h, --help show this help message and exit ``` -Instead of explaining each command at a high level, let's just walk through some common testing scenarios and describe how to run them. +## `full-sweep` Command -**Scenario 1**: I want to change increase the concurrency from 128 to 256 in the 1k1k scenario for the `dsr1-fp4-b200-sglang` config (from `.github/configs/nvidia-master.yaml`) and then test it. +The `full-sweep` command generates benchmark configurations with optional filtering. It requires specifying either `--single-node` or `--multi-node`. -Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the text following command as the text input: ``` -test-config --key dsr1-fp4-b200-sglang --seq-len 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +usage: generate_sweep_configs.py full-sweep + --config-files CONFIG_FILES [CONFIG_FILES ...] 
+ --runner-config RUNNER_CONFIG + [--model-prefix MODEL_PREFIX [MODEL_PREFIX ...]] + [--precision PRECISION [PRECISION ...]] + [--framework FRAMEWORK [FRAMEWORK ...]] + [--runner-type RUNNER_TYPE [RUNNER_TYPE ...]] + [--seq-lens {1k1k,1k8k,8k1k} [{1k1k,1k8k,8k1k} ...]] + [--step-size STEP_SIZE] + [--max-conc MAX_CONC] + [--max-tp MAX_TP] + [--max-ep MAX_EP] + (--single-node | --multi-node) ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986046399 +### Examples -If we wanted to also test 1k8k or 8k1k scenarios, we would simply append `1k8k` or `8k1k` to `--seq-len`, respectively. - -Further, if we wanted to run that config on *one specific* runner node, we could specify that by appending `--runner-node` to the argument list. Note that if the specified runner node is not compatible with the specified config key (as dictated by `.github/configs/runners.yaml`), then the workflow will error: +**Test all single-node gptoss configurations on B200 with 1k1k sequence lengths:** +``` +full-sweep --single-node --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` +**Test all single-node fp8 precision configs for 1k8k workloads:** +``` +full-sweep --single-node --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml ``` -test-config --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml --key dsr1-fp4-b200-sglang --seq-len 1k1k --runner-node mi300x-amd_0 -ValueError: Runner node 'mi300x-amd_0' is not compatible with config 'dsr1-fp4-b200-sglang' which runs on runner type 'b200'. Available runner nodes for this config are 'b200-nb_0, b200-nb_1, b200-nvd_0, b200-nvd_1, b200-nvd_2, b200-nvd_3, b200-tg_0'. 
+**Test all single-node TRT configs on H200 runners:** +``` +full-sweep --single-node --framework trt --runner-type h200 b200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986053019/job/54229839736 +**Test specific single-node model on specific hardware with specific sequence lengths:** +``` +full-sweep --single-node --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sglang --seq-lens 1k1k 8k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` -**Scenario 2**: I just made a change to the `benchmarks/dsr1_fp8_b200_docker.sh` and I need to verify that these changes work across all B200 runners. +**Limit concurrency and parallelism for faster testing:** +``` +full-sweep --single-node --max-conc 64 --max-tp 4 --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +``` -Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the text following command as the text input: +**Test all multi-node configurations:** ``` -runner-sweep --runner-type b200 --model-prefix dsr1 --precision fp8 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +full-sweep --multi-node --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986283169 +## `runner-model-sweep` Command + +The `runner-model-sweep` command validates that all runner nodes of a specific type work with all model configurations. It requires specifying either `--single-node` or `--multi-node`. + +``` +usage: generate_sweep_configs.py runner-model-sweep + --config-files CONFIG_FILES [CONFIG_FILES ...] 
+ --runner-config RUNNER_CONFIG + --runner-type RUNNER_TYPE + [--runner-node-filter RUNNER_NODE_FILTER] + (--single-node | --multi-node) +``` -This will run a test (just the highest available parallelism and lowest available concurrency) for each B200 runner node for each Deepseek config that runs on B200 with fp8 precision. I.e., this can be used to "sweep" across runners for a particular model to test that all runners still work with changes that have been made. +### Scenario: Validating Runner Infrastructure -**Scenario 3**: I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. +I just upgraded the CUDA drivers on all H200 runners and need to verify that all models that use H200 still work correctly across all H200 nodes. Go to the GitHub Actions UI, click on the `End-to-End Tests` workflow, and enter the following command as the text input: ``` -runner-model-sweep --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +runner-model-sweep --single-node --runner-type h200 --config-files .github/configs/amd-master.yaml .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml ``` -Workflow Run Example: https://github.com/InferenceMAX/InferenceMAX/actions/runs/18986292917 - This will run a test (just the highest available parallelism and lowest available concurrency) for each configuration that specifies the `h200` runner type, across all H200 runner nodes defined in `.github/configs/runners.yaml`. For example, if you have configs `dsr1-fp8-h200-sglang`, `dsr1-fp8-h200-trt`, and `gptoss-fp4-h200-vllm` that all use `runner: h200`, and you have 8 H200 nodes (`h200-cw_0`, `h200-cw_1`, etc.), this will run all 3 configs on all 8 nodes (24 total test runs). 
@@ -76,49 +108,110 @@ This is particularly useful when: - You've added new runner nodes and want to validate they work with all existing model configurations - You want to verify that all models remain compatible with a specific GPU type after system updates -**Key difference from Scenario 2**: -- `runner-sweep`: Fix a **model**, sweep across runners → "Does this model work on all its runners?" -- `runner-model-sweep`: Fix a **runner type**, sweep across models → "Do all models work on this runner type?" +### Filtering Runner Nodes -## Additional Use Cases with `full-sweep` - -The `full-sweep` command supports multiple filters that can be combined for targeted testing: - -**Test all gptoss configurations on B200 with 1k1k sequence lengths:** +Use `--runner-node-filter` to only test a subset of runner nodes: ``` -full-sweep --model-prefix gptoss --runner-type b200 --seq-lens 1k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +runner-model-sweep --single-node --runner-type mi300x --runner-node-filter mi300x-amd --config-files .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml ``` -**Test all fp8 precision configs across all runners for 1k8k workloads:** -``` -full-sweep --precision fp8 --seq-lens 1k8k --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml -``` +This will only include runner nodes whose names contain "mi300x-amd" -**Test all TRT configs on H200 runners:** -``` -full-sweep --framework trt --runner-type h200 b200-trt --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml -``` +## Validation Architecture -**Quick smoke test of all configs (highest TP, lowest concurrency only):** -``` -full-sweep --test-mode --config-files .github/configs/nvidia-master.yaml .github/configs/amd-master.yaml --runner-config .github/configs/runners.yaml +The benchmarking system uses a strict 
validation methodology to ensure correctness at every stage. This is implemented in `utils/matrix-logic/validation.py` using Pydantic models. + +### Validation Methodology + +The system validates **both ends** of the configuration pipeline: + +1. **Input Validation (Master Configs)**: Validates the structure of `.github/configs/*.yaml` files before any processing occurs +2. **Output Validation (Matrix Entries)**: Validates the generated matrix entries that are passed to workflow templates + +This dual-validation approach ensures: +- No malformed configurations enter the pipeline +- No invalid parameters reach the benchmark workflows +- Workflow templates (`benchmark-tmpl.yml`, `benchmark-multinode-tmpl.yml`) can assume all inputs are valid—no runtime validation needed + +### Input Validation: Master Config Files + +Master config files (e.g., `nvidia-master.yaml`, `amd-master.yaml`) are validated against strict Pydantic schemas: + +- **`SingleNodeMasterConfigEntry`**: Validates single-node configurations +- **`MultiNodeMasterConfigEntry`**: Validates multi-node configurations + +Each config must specify: +- Required fields: `image`, `model`, `model-prefix`, `precision`, `framework`, `runner`, `multinode` +- Sequence length configs with search spaces defining TP, EP, concurrency ranges, etc. +- Optional fields like `disagg`, `spec-decoding`, `dp-attn` + +Invalid or missing fields raise immediate validation errors before any matrix generation. + +### Output Validation: Matrix Entries + +Generated matrix entries (the actual workflow inputs) are validated against: + +- **`SingleNodeMatrixEntry`**: Matches the inputs expected by `benchmark-tmpl.yml` +- **`MultiNodeMatrixEntry`**: Matches the inputs expected by `benchmark-multinode-tmpl.yml` + +These Pydantic models mirror the workflow template input definitions exactly. 
For example, `benchmark-tmpl.yml` expects: +```yaml +inputs: + runner: required + image: required + model: required + model-prefix: required + precision: required + framework: required + ... ``` -**Test specific model on specific hardware with specific sequence lengths:** +The corresponding `SingleNodeMatrixEntry` enforces these same fields with appropriate types. + +### Key Design Principles + +1. **No defaults in output validation**: Matrix entry models don't set defaults. Missing values must fail validation rather than silently using fallbacks. + +2. **`extra='forbid'`**: Unknown fields are rejected, preventing typos or deprecated fields from slipping through. + +3. **Strict typing**: Fields like `spec-decoding` use `Literal["mtp", "draft_model", "none"]` to restrict values to known options. + +4. **Concurrency validation**: The system ensures either `conc-list` OR `conc-start`/`conc-end` is provided, but not both. + +### Validation Flow + ``` -full-sweep --model-prefix dsr1 --runner-type b200 --precision fp4 --framework sglang --seq-lens 1k1k 8k1k --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +.github/configs/*.yaml + │ + ▼ +┌─────────────────────────┐ +│ validate_master_config │ ← Input validation (Pydantic) +└─────────────────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ generate_sweep_configs │ ← Matrix generation +└─────────────────────────┘ + │ + ▼ +┌─────────────────────────┐ +│ validate_matrix_entry │ ← Output validation (Pydantic) +└─────────────────────────┘ + │ + ▼ + benchmark-tmpl.yml or + benchmark-multinode-tmpl.yml ``` -## Custom One-off Tests +## Utility Scripts -**Scenario 4**: I want to run a quick test with a custom image, model, or configuration that isn't in the config files yet. 
+### `utils/summarize.py` -Use the `custom` command to specify all parameters manually: -``` -custom --runner-label b200-nb_0 --image vllm/vllm-openai:v0.11.0 --model meta-llama/Llama-3.1-70B --framework vllm --precision fp8 --exp-name llama70b_test --config-files .github/configs/nvidia-master.yaml --runner-config .github/configs/runners.yaml +Aggregates benchmark results from a directory of JSON files and outputs a markdown summary table. Used after `collect-results.yml` downloads all artifacts. + +Usage: +```bash +python utils/summarize.py ``` -This runs a single 1k1k test job with your custom parameters on the specified runner node. Useful for: -- Testing new images before adding them to config files -- Quick validation of new models -- Experimenting with different frameworks or precisions +Outputs GitHub-flavored markdown tables with metrics including TTFT, TPOT, interactivity, E2EL, and throughput per GPU for both single-node and multi-node results. From ca1c27940f6ef65022367b4b96302e8f2351b3bb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 5 Dec 2025 14:37:23 -0600 Subject: [PATCH 98/98] add image to json result --- utils/process_result.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/utils/process_result.py b/utils/process_result.py index 558443948..0a84a1f18 100644 --- a/utils/process_result.py +++ b/utils/process_result.py @@ -25,7 +25,7 @@ def get_required_env_vars(required_vars): # Base required env vars base_env = get_required_env_vars([ 'RUNNER_TYPE', 'FRAMEWORK', 'PRECISION', 'SPEC_DECODING', - 'RESULT_FILENAME', 'ISL', 'OSL', 'DISAGG', 'MODEL_PREFIX' + 'RESULT_FILENAME', 'ISL', 'OSL', 'DISAGG', 'MODEL_PREFIX', 'IMAGE' ]) hw = base_env['RUNNER_TYPE'] @@ -37,6 +37,7 @@ def get_required_env_vars(required_vars): result_filename = base_env['RESULT_FILENAME'] isl = base_env['ISL'] osl = base_env['OSL'] +image = base_env['IMAGE'] with open(f'{result_filename}.json') as f: bmk_result = json.load(f) @@ -44,6 +45,7 @@ def 
get_required_env_vars(required_vars): data = { 'hw': hw, 'conc': int(bmk_result['max_concurrency']), + 'image': image, 'model': bmk_result['model_id'], 'infmax_model_prefix': model_prefix, 'framework': framework,