From a1f83d7b781fcedc7ec8caabe7095b371edd2e3b Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 27 Jan 2026 11:31:10 -0800 Subject: [PATCH 1/6] Add H200 sglang disagg configs from srtslurm - Add dsr1-fp8-h200-dynamo-sglang config to nvidia-master.yaml - Include 1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP/DEP (1P6D) - Include 8k1k configs: aggregated, TEP variants (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D) - Add perf-changelog entry for new configuration - Document recipe registration process in AGENT.md --- .github/configs/nvidia-master.yaml | 159 ++++++++++++++++++++++++++++- AGENT.md | 86 ++++++++++++++++ perf-changelog.yaml | 14 ++- 3 files changed, 257 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index f12b586f4..3b14a4213 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2829,4 +2829,161 @@ gptoss-fp4-gb200-dynamo-trt: - "DECODE_MAX_NUM_TOKENS=20000" - "DECODE_MAX_BATCH_SIZE=512" - "DECODE_GPU_MEM_FRACTION=0.9" - + + +dsr1-fp8-h200-dynamo-sglang: + image: lmsysorg/sglang:v0.5.8-cu130-runtime + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: h200-multinode-slurm + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # Aggregated mode (single node TEP) + - conc-list: [1, 4, 16, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml" + decode: + num-worker: 0 + tp: 8 + ep: 1 + dp-attn: false + # Low latency (1 prefill, 9 decode, TEP) + - conc-list: [1, 4, 8, 16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml" + decode: + num-worker: 9 + tp: 8 + ep: 1 + dp-attn: false + # High throughput TEP (1 prefill, 6 decode) + - conc-list: [512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # High throughput DEP (1 prefill, 6 decode, dp-attention) + - conc-list: [128, 256, 512, 1024, 2048] + prefill: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 8 + dp-attn: true + - isl: 8192 + osl: 1024 + search-space: + # Aggregated mode (single node TEP) + - conc-list: [1, 4, 16, 32, 64, 128, 256] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs128-agg-tp.yaml" + decode: + num-worker: 0 + tp: 8 + ep: 1 + dp-attn: false + # Low latency TEP (1 prefill, 7 decode) + - conc-list: [1, 4, 8] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 1 + dp-attn: false + # TEP (1 prefill, 6 decode) + - conc-list: [4, 8, 16] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml" + decode: + num-worker: 6 + tp: 8 + ep: 1 + dp-attn: false + # TEP (1 prefill, 3 decode) + - conc-list: [8, 16, 32] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + # TEP (2 prefill, 3 decode) + - conc-list: [32, 64, 128] + prefill: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml" + decode: + num-worker: 3 + tp: 8 + ep: 1 + dp-attn: false + # High throughput DEP (1 prefill, 1 decode, dp-attention) + - conc-list: [64, 128, 256] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true diff --git a/AGENT.md b/AGENT.md index 17e491934..ecc1862f8 100644 --- a/AGENT.md +++ b/AGENT.md @@ -179,6 +179,92 @@ When working with benchmark configurations, use these valid values: 2. Create launcher script in `runners/` directory 3. Update relevant master config with new runner type +### Registering Recipes from srtslurm + +For disaggregated multi-node configurations (dynamo-sglang, dynamo-trt), recipes are stored in the external [srtslurm](https://github.com/ishandhanani/srt-slurm) repository. To stage these recipes in InferenceMAX: + +**1. Locate source recipes in srtslurm:** +```bash +# Example: H200 sglang disagg recipes +ls /path/to/srtslurm/recipes/h200/ +# 1k1k/ 8k1k/ +``` + +**2. Analyze recipe structure:** +Each recipe YAML contains: +- `name`: Recipe identifier +- `model`: Model path/container info +- `resources`: GPU type, prefill/decode node/worker counts +- `backend.sglang_config`: Prefill and decode configuration (tp-size, dp-size, ep-size, dp-attention, etc.) +- `benchmark`: ISL/OSL and concurrency settings + +**3. Add config to nvidia-master.yaml:** +```yaml +dsr1-fp8-h200-dynamo-sglang: + image: lmsysorg/sglang:v0.5.8-cu130-runtime + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: h200-multinode-slurm + precision: fp8 + framework: dynamo-sglang + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - conc-list: [1, 4, 16, 32, 64, 128, 256, 512] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml" + decode: + num-worker: 0 + tp: 8 + ep: 1 + dp-attn: false +``` + +**4. Key mapping from srtslurm to nvidia-master.yaml:** + +| srtslurm field | nvidia-master.yaml field | +|----------------|-------------------------| +| `resources.prefill_workers` | `prefill.num-worker` | +| `resources.decode_workers` | `decode.num-worker` | +| `sglang_config.prefill.tp-size` | `prefill.tp` | +| `sglang_config.prefill.ep-size` | `prefill.ep` | +| `sglang_config.prefill.enable-dp-attention` | `prefill.dp-attn` | +| `benchmark.concurrencies` (parsed) | `conc-list` | +| Recipe file path | `additional-settings: CONFIG_FILE=...` | + +**5. Common patterns:** +- **Aggregated (AGG)**: Single node, `num-worker: 1` for prefill, `num-worker: 0` for decode +- **TEP (Tensor-Expert Parallel)**: `dp-attn: false`, `ep: 1` +- **DEP (Data-Expert Parallel)**: `dp-attn: true`, `ep: 8` (typically) +- **Low latency**: More decode workers (e.g., 9), lower concurrencies +- **High throughput**: Fewer decode workers, higher concurrencies + +**6. Add perf-changelog entry:** +```yaml +- config-keys: + - dsr1-fp8-h200-dynamo-sglang + description: + - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" + - "Recipes sourced from srtslurm repo (recipes/h200/)" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/XXX +``` + +**7. Validate configuration:** +```bash +python utils/matrix_logic/generate_sweep_configs.py full-sweep \ + --master-config .github/configs/nvidia-master.yaml \ + --framework dynamo-sglang +``` + ### Updating Docker Images When upgrading Docker images in benchmark scripts and master configs .yaml: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3ac387147..96fffb528 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -309,7 +309,7 @@ - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/570 - + - config-keys: - dsr1-fp4-gb200-dynamo-trt description: @@ -325,3 +325,15 @@ - "Disable torch.compile for MI355X DeepSeek-R1 FP8 SGLang" - "set cuda-graph-max-bs to CONC" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/613 + +- config-keys: + - dsr1-fp8-h200-dynamo-sglang + description: + - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration" + - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime" + - "Runner: h200-multinode-slurm with multinode and disagg enabled" + - "Recipes sourced from srtslurm repo (recipes/h200/)" + - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)" + - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)" + - "Concurrency levels range from 1 to 2048 depending on configuration" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/582 From ed8ed217a300862a14bc49d4ac112afa61236f74 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 2 Feb 2026 10:20:41 -0800 Subject: [PATCH 2/6] Rename AGENT.md to AGENTS.md Co-Authored-By: Claude Opus 4.5 --- AGENT.md => AGENTS.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename AGENT.md => AGENTS.md (100%) diff --git a/AGENT.md b/AGENTS.md similarity index 100% rename from AGENT.md rename to AGENTS.md From 4b7646e2a646fe7a910963ff8e5d36b40378606e Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 2 Feb 2026 10:46:22 -0800 Subject: [PATCH 3/6] Add nginx container alias to h200 srtslurm config Co-Authored-By: Claude Opus 4.5 --- runners/launch_h200-dgxc-slurm.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 5965171bc..59af13d4c 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -71,6 +71,7 @@ model_paths: containers: latest: "${SQUASH_FILE}" "${CONTAINER_KEY}": "${SQUASH_FILE}" + nginx: "/data/containers/nginx+1.27.4.sqsh" EOF echo "Generated srtslurm.yaml:" From ad45ff7999595978957ff97c0aacdd2e0f6126a7 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Mon, 2 Feb 2026 23:15:14 -0800 Subject: [PATCH 4/6] temp --- runners/launch_h200-dgxc-slurm.sh | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index 59af13d4c..c3d2a3016 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -105,6 +105,16 @@ echo "Job $JOB_ID completed!" echo "Collecting results..." +# Display sweep log for debugging +SWEEP_LOG="outputs/$JOB_ID/logs/sweep_${JOB_ID}.log" +if [ -f "$SWEEP_LOG" ]; then + echo "=== Sweep Log ($SWEEP_LOG) ===" + cat "$SWEEP_LOG" + echo "=== End Sweep Log ===" +else + echo "Warning: Sweep log not found at $SWEEP_LOG" +fi + # Use the JOB_ID to find the logs directory # srtctl creates logs in outputs/JOB_ID/logs/ LOGS_DIR="outputs/$JOB_ID/logs" From c9e78c156e30f24ce450f75c0b66d1b16f1e3cf3 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 3 Feb 2026 01:06:26 -0800 Subject: [PATCH 5/6] fix --- runners/launch_h200-dgxc-slurm.sh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh index c3d2a3016..a9b55d849 100755 --- a/runners/launch_h200-dgxc-slurm.sh +++ b/runners/launch_h200-dgxc-slurm.sh @@ -81,7 +81,11 @@ echo "Running make setup..." make setup ARCH=x86_64 echo "Submitting job with srtctl..." -SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,dsr1,fp8,${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --setup-script fix-timeouts-x86.sh --tags "h200,dsr1,fp8,${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +else + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,dsr1,fp8,${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +fi echo "$SRTCTL_OUTPUT" # Extract JOB_ID from srtctl output From 9b96bcc284ec12a954fcf80a2c8a9491f3474976 Mon Sep 17 00:00:00 2001 From: ishandhanani Date: Tue, 3 Feb 2026 12:13:32 -0800 Subject: [PATCH 6/6] move to cu13 container (instead of runtime use full image) --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3b14a4213..369aa5796 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -2832,7 +2832,7 @@ gptoss-fp4-gb200-dynamo-trt: dsr1-fp8-h200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.8-cu130-runtime + image: lmsysorg/sglang:v0.5.8-cu130 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: h200-multinode-slurm