diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 807a7fa81..05ff9654d 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2266,160 +2266,4 @@ gptoss-fp4-gb200-dynamo-trt: - "DECODE_MAX_NUM_TOKENS=20000" - "DECODE_MAX_BATCH_SIZE=512" - "DECODE_GPU_MEM_FRACTION=0.9" - -dsr1-fp8-h200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.8-cu130-runtime - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: h200-multinode-slurm - precision: fp8 - framework: dynamo-sglang - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - # Aggregated mode (single node TEP) - - conc-list: [1, 4, 16, 32, 64, 128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml" - decode: - num-worker: 0 - tp: 8 - ep: 1 - dp-attn: false - # Low latency (1 prefill, 9 decode, TEP) - - conc-list: [1, 4, 8, 16, 32, 64, 128, 256] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml" - decode: - num-worker: 9 - tp: 8 - ep: 1 - dp-attn: false - # High throughput TEP (1 prefill, 6 decode) - - conc-list: [512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # High throughput DEP (1 prefill, 6 decode, dp-attention) - - conc-list: [128, 256, 512, 1024, 2048] - prefill: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 8 - dp-attn: true - - isl: 8192 - osl: 1024 - search-space: - # Aggregated mode (single node TEP) - - conc-list: [1, 4, 16, 32, 64, 128, 256] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-agg-tp.yaml" - decode: - num-worker: 0 - tp: 8 - ep: 1 - dp-attn: false - # Low latency TEP (1 prefill, 7 decode) - - conc-list: [1, 4, 8] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml" - decode: - num-worker: 7 - tp: 8 - ep: 1 - dp-attn: false - # TEP (1 prefill, 6 decode) - - conc-list: [4, 8, 16] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml" - decode: - num-worker: 6 - tp: 8 - ep: 1 - dp-attn: false - # TEP (1 prefill, 3 decode) - - conc-list: [8, 16, 32] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - # TEP (2 prefill, 3 decode) - - conc-list: [32, 64, 128] - prefill: - num-worker: 2 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml" - decode: - num-worker: 3 - tp: 8 - ep: 1 - dp-attn: false - # High throughput DEP (1 prefill, 1 decode, dp-attention) - - conc-list: [64, 128, 256] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml" - decode: - num-worker: 1 - tp: 8 - ep: 8 - dp-attn: true + \ No newline at end of file diff --git a/AGENT.md b/AGENT.md index bdd987aa0..b60f3d405 100644 --- a/AGENT.md +++ b/AGENT.md @@ -179,92 +179,6 @@ When working with benchmark configurations, use these valid values: 2. Create launcher script in `runners/` directory 3. Update relevant master config with new runner type -### Registering Recipes from srtslurm - -For disaggregated multi-node configurations (dynamo-sglang, dynamo-trt), recipes are stored in the external [srtslurm](https://github.com/ishandhanani/srt-slurm) repository. To stage these recipes in InferenceMAX: - -**1. Locate source recipes in srtslurm:** -```bash -# Example: H200 sglang disagg recipes -ls /path/to/srtslurm/recipes/h200/ -# 1k1k/ 8k1k/ -``` - -**2. Analyze recipe structure:** -Each recipe YAML contains: -- `name`: Recipe identifier -- `model`: Model path/container info -- `resources`: GPU type, prefill/decode node/worker counts -- `backend.sglang_config`: Prefill and decode configuration (tp-size, dp-size, ep-size, dp-attention, etc.) -- `benchmark`: ISL/OSL and concurrency settings - -**3. Add config to nvidia-master.yaml:** -```yaml -dsr1-fp8-h200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.8-cu130-runtime - model: deepseek-ai/DeepSeek-R1-0528 - model-prefix: dsr1 - runner: h200-multinode-slurm - precision: fp8 - framework: dynamo-sglang - multinode: true - disagg: true - seq-len-configs: - - isl: 1024 - osl: 1024 - search-space: - - conc-list: [1, 4, 16, 32, 64, 128, 256, 512] - prefill: - num-worker: 1 - tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml" - decode: - num-worker: 0 - tp: 8 - ep: 1 - dp-attn: false -``` - -**4. Key mapping from srtslurm to nvidia-master.yaml:** - -| srtslurm field | nvidia-master.yaml field | -|----------------|-------------------------| -| `resources.prefill_workers` | `prefill.num-worker` | -| `resources.decode_workers` | `decode.num-worker` | -| `sglang_config.prefill.tp-size` | `prefill.tp` | -| `sglang_config.prefill.ep-size` | `prefill.ep` | -| `sglang_config.prefill.enable-dp-attention` | `prefill.dp-attn` | -| `benchmark.concurrencies` (parsed) | `conc-list` | -| Recipe file path | `additional-settings: CONFIG_FILE=...` | - -**5. Common patterns:** -- **Aggregated (AGG)**: Single node, `num-worker: 1` for prefill, `num-worker: 0` for decode -- **TEP (Tensor-Expert Parallel)**: `dp-attn: false`, `ep: 1` -- **DEP (Data-Expert Parallel)**: `dp-attn: true`, `ep: 8` (typically) -- **Low latency**: More decode workers (e.g., 9), lower concurrencies -- **High throughput**: Fewer decode workers, higher concurrencies - -**6. Add perf-changelog entry:** -```yaml -- config-keys: - - dsr1-fp8-h200-dynamo-sglang - description: - - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" - - "Recipes sourced from srtslurm repo (recipes/h200/)" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/XXX -``` - -**7. Validate configuration:** -```bash -python utils/matrix_logic/generate_sweep_configs.py full-sweep \ - --master-config .github/configs/nvidia-master.yaml \ - --framework dynamo-sglang -``` - ### Updating Docker Images When upgrading Docker images in benchmark scripts and master configs .yaml: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c0fbb2415..90fac5724 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -283,15 +283,3 @@ - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/570 - -- config-keys: - - dsr1-fp8-h200-dynamo-sglang - description: - - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" - - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" - - "Runner: h200-multinode-slurm with multinode and disagg enabled" - - "Recipes sourced from srtslurm repo (recipes/h200/)" - - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" - - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" - - "Concurrency levels range from 1 to 2048 depending on configuration" - pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/TBD