From a1f83d7b781fcedc7ec8caabe7095b371edd2e3b Mon Sep 17 00:00:00 2001
From: ishandhanani <ishandhanani@gmail.com>
Date: Tue, 27 Jan 2026 11:31:10 -0800
Subject: [PATCH 1/6] Add H200 sglang disagg configs from srtslurm

- Add dsr1-fp8-h200-dynamo-sglang config to nvidia-master.yaml
- Include 1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP/DEP (1P6D)
- Include 8k1k configs: aggregated, TEP variants (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)
- Add perf-changelog entry for new configuration
- Document recipe registration process in AGENT.md
---
 .github/configs/nvidia-master.yaml | 159 ++++++++++++++++++++++++++++-
 AGENT.md                           |  86 ++++++++++++++++
 perf-changelog.yaml                |  14 ++-
 3 files changed, 257 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index f12b586f4..3b14a4213 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2829,4 +2829,161 @@ gptoss-fp4-gb200-dynamo-trt:
         - "DECODE_MAX_NUM_TOKENS=20000"
         - "DECODE_MAX_BATCH_SIZE=512"
         - "DECODE_GPU_MEM_FRACTION=0.9"
-        
+
+
+dsr1-fp8-h200-dynamo-sglang:
+  image: lmsysorg/sglang:v0.5.8-cu130-runtime
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200-multinode-slurm
+  precision: fp8
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024  # 1k input / 1k output sweep; recipes live under recipes/h200/1k1k/
+    osl: 1024
+    search-space:
+    # Aggregated mode (single node TEP): one prefill-side worker, no decode workers
+    - conc-list: [1, 4, 16, 32, 64, 128, 256, 512]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml"
+      decode:
+        num-worker: 0  # aggregated entries declare no separate decode workers
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # Low latency: 1 prefill worker + 9 decode workers, TEP (ep: 1, dp-attn off) on both sides
+    - conc-list: [1, 4, 8, 16, 32, 64, 128, 256]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/1k1k/low-latency-1p9d.yaml"
+      decode:
+        num-worker: 9
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # High throughput TEP: 1 prefill worker + 6 decode workers, high concurrencies only
+    - conc-list: [512, 1024, 2048]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-tp.yaml"
+      decode:
+        num-worker: 6
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # High throughput DEP: 1 prefill worker + 6 decode workers, ep: 8 with dp-attention on both sides
+    - conc-list: [128, 256, 512, 1024, 2048]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/1k1k/bs256-1p6d-dep.yaml"
+      decode:
+        num-worker: 6
+        tp: 8
+        ep: 8
+        dp-attn: true
+  - isl: 8192  # 8k input / 1k output sweep; recipes live under recipes/h200/8k1k/
+    osl: 1024
+    search-space:
+    # Aggregated mode (single node TEP): one prefill-side worker, no decode workers
+    - conc-list: [1, 4, 16, 32, 64, 128, 256]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/8k1k/bs128-agg-tp.yaml"
+      decode:
+        num-worker: 0  # aggregated entries declare no separate decode workers
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # Low latency TEP: 1 prefill worker + 7 decode workers, lowest concurrencies
+    - conc-list: [1, 4, 8]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/8k1k/bs4-1p7d.yaml"
+      decode:
+        num-worker: 7
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # TEP: 1 prefill worker + 6 decode workers
+    - conc-list: [4, 8, 16]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/8k1k/bs8-1p6d.yaml"
+      decode:
+        num-worker: 6
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # TEP: 1 prefill worker + 3 decode workers
+    - conc-list: [8, 16, 32]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/8k1k/bs16-1p3d.yaml"
+      decode:
+        num-worker: 3
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # TEP: 2 prefill workers + 3 decode workers (the only entry here with more than one prefill worker)
+    - conc-list: [32, 64, 128]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/8k1k/bs64-2p3d.yaml"
+      decode:
+        num-worker: 3
+        tp: 8
+        ep: 1
+        dp-attn: false
+    # High throughput, DEP on decode only: 1 TEP prefill worker + 1 DEP decode worker (ep: 8, dp-attention)
+    - conc-list: [64, 128, 256]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1  # NOTE(review): prefill is TEP here, unlike the 1k1k DEP entry (ep: 8, dp-attn: true) - confirm against the recipe
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/8k1k/bs128-1p1d-dep.yaml"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
diff --git a/AGENT.md b/AGENT.md
index 17e491934..ecc1862f8 100644
--- a/AGENT.md
+++ b/AGENT.md
@@ -179,6 +179,92 @@ When working with benchmark configurations, use these valid values:
 2. Create launcher script in `runners/` directory
 3. Update relevant master config with new runner type
 
+### Registering Recipes from srtslurm
+
+For disaggregated multi-node configurations (dynamo-sglang, dynamo-trt), recipes are stored in the external [srtslurm](https://github.com/ishandhanani/srt-slurm) repository. To register these recipes in InferenceMAX:
+
+**1. Locate source recipes in srtslurm:**
+```bash
+# Example: H200 sglang disagg recipes
+ls /path/to/srtslurm/recipes/h200/
+# 1k1k/  8k1k/
+```
+
+**2. Analyze recipe structure:**
+Each recipe YAML contains:
+- `name`: Recipe identifier
+- `model`: Model path/container info
+- `resources`: GPU type, prefill/decode node/worker counts
+- `backend.sglang_config`: Prefill and decode configuration (tp-size, dp-size, ep-size, dp-attention, etc.)
+- `benchmark`: ISL/OSL and concurrency settings
+
+**3. Add config to nvidia-master.yaml:**
+```yaml
+dsr1-fp8-h200-dynamo-sglang:
+  image: lmsysorg/sglang:v0.5.8-cu130-runtime
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: h200-multinode-slurm
+  precision: fp8
+  framework: dynamo-sglang
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - conc-list: [1, 4, 16, 32, 64, 128, 256, 512]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "CONFIG_FILE=recipes/h200/1k1k/bs128-agg-tp.yaml"
+      decode:
+        num-worker: 0
+        tp: 8
+        ep: 1
+        dp-attn: false
+```
+
+**4. Key mapping from srtslurm to nvidia-master.yaml:**
+
+| srtslurm field | nvidia-master.yaml field |
+|----------------|-------------------------|
+| `resources.prefill_workers` | `prefill.num-worker` |
+| `resources.decode_workers` | `decode.num-worker` |
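+| `sglang_config.prefill.tp-size` / `sglang_config.decode.tp-size` | `prefill.tp` / `decode.tp` |
+| `sglang_config.prefill.ep-size` / `sglang_config.decode.ep-size` | `prefill.ep` / `decode.ep` |
+| `sglang_config.prefill.enable-dp-attention` / `sglang_config.decode.enable-dp-attention` | `prefill.dp-attn` / `decode.dp-attn` |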
+| `benchmark.concurrencies` (parsed) | `conc-list` |
+| Recipe file path | `additional-settings: CONFIG_FILE=...` |
+
+**5. Common patterns:**
+- **Aggregated (AGG)**: Single node, `num-worker: 1` for prefill, `num-worker: 0` for decode
+- **TEP (Tensor-Expert Parallel)**: `dp-attn: false`, `ep: 1`
+- **DEP (Data-Expert Parallel)**: `dp-attn: true`, `ep: 8` (typically)
+- **Low latency**: More decode workers (e.g., 9), lower concurrencies
+- **High throughput**: Fewer decode workers, higher concurrencies
+
+**6. Add perf-changelog entry:**
+```yaml
+- config-keys:
+    - dsr1-fp8-h200-dynamo-sglang
+  description:
+    - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration"
+    - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime"
+    - "Recipes sourced from srtslurm repo (recipes/h200/)"
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/XXX
+```
+
+**7. Validate configuration:**
+```bash
+python utils/matrix_logic/generate_sweep_configs.py full-sweep \
+  --master-config .github/configs/nvidia-master.yaml \
+  --framework dynamo-sglang
+```
+
 ### Updating Docker Images
 
 When upgrading Docker images in benchmark scripts and master configs .yaml:
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3ac387147..96fffb528 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -309,7 +309,7 @@
     - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths"
     - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/570
-  
+
 - config-keys:
     - dsr1-fp4-gb200-dynamo-trt
   description:
@@ -325,3 +325,15 @@
     - "Disable torch.compile for MI355X DeepSeek-R1 FP8 SGLang"
     - "set cuda-graph-max-bs to CONC"
   pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/613
+
+- config-keys:
+    - dsr1-fp8-h200-dynamo-sglang
+  description:
+    - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration"
+    - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime"
+    - "Runner: h200-multinode-slurm with multinode and disagg enabled"
+    - "Recipes sourced from srtslurm repo (recipes/h200/)"
+    - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)"
+    - "8k1k configs: aggregated, TEP configs (1P7D, 1P6D, 1P3D, 2P3D), DEP (1P1D)"
+    - "Concurrency levels range from 1 to 2048 depending on configuration"
+  pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/582

From ed8ed217a300862a14bc49d4ac112afa61236f74 Mon Sep 17 00:00:00 2001
From: ishandhanani <ishandhanani@gmail.com>
Date: Mon, 2 Feb 2026 10:20:41 -0800
Subject: [PATCH 2/6] Rename AGENT.md to AGENTS.md

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 AGENT.md => AGENTS.md | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename AGENT.md => AGENTS.md (100%)

diff --git a/AGENT.md b/AGENTS.md
similarity index 100%
rename from AGENT.md
rename to AGENTS.md

From 4b7646e2a646fe7a910963ff8e5d36b40378606e Mon Sep 17 00:00:00 2001
From: ishandhanani <ishandhanani@gmail.com>
Date: Mon, 2 Feb 2026 10:46:22 -0800
Subject: [PATCH 3/6] Add nginx container alias to h200 srtslurm config

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
---
 runners/launch_h200-dgxc-slurm.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 5965171bc..59af13d4c 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -71,6 +71,7 @@ model_paths:
 containers:
   latest: "${SQUASH_FILE}"
   "${CONTAINER_KEY}": "${SQUASH_FILE}"
+  nginx: "/data/containers/nginx+1.27.4.sqsh"
 EOF
 
 echo "Generated srtslurm.yaml:"

From ad45ff7999595978957ff97c0aacdd2e0f6126a7 Mon Sep 17 00:00:00 2001
From: ishandhanani <ishandhanani@gmail.com>
Date: Mon, 2 Feb 2026 23:15:14 -0800
Subject: [PATCH 4/6] Print srtctl sweep log after job completion for debugging

---
 runners/launch_h200-dgxc-slurm.sh | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index 59af13d4c..c3d2a3016 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -105,6 +105,16 @@ echo "Job $JOB_ID completed!"
 
 echo "Collecting results..."
 
+# Print the sweep log to stdout for debugging; a missing log only produces a warning
+SWEEP_LOG="outputs/$JOB_ID/logs/sweep_${JOB_ID}.log"
+if [ -f "$SWEEP_LOG" ]; then
+    echo "=== Sweep Log ($SWEEP_LOG) ==="
+    cat "$SWEEP_LOG"
+    echo "=== End Sweep Log ==="
+else
+    echo "Warning: Sweep log not found at $SWEEP_LOG"
+fi
+
 # Use the JOB_ID to find the logs directory
 # srtctl creates logs in outputs/JOB_ID/logs/
 LOGS_DIR="outputs/$JOB_ID/logs"

From c9e78c156e30f24ce450f75c0b66d1b16f1e3cf3 Mon Sep 17 00:00:00 2001
From: ishandhanani <ishandhanani@gmail.com>
Date: Tue, 3 Feb 2026 01:06:26 -0800
Subject: [PATCH 5/6] Pass fix-timeouts-x86.sh setup script to srtctl for dynamo-sglang

---
 runners/launch_h200-dgxc-slurm.sh | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/runners/launch_h200-dgxc-slurm.sh b/runners/launch_h200-dgxc-slurm.sh
index c3d2a3016..a9b55d849 100755
--- a/runners/launch_h200-dgxc-slurm.sh
+++ b/runners/launch_h200-dgxc-slurm.sh
@@ -81,7 +81,16 @@ echo "Running make setup..."
 make setup ARCH=x86_64
 
 echo "Submitting job with srtctl..."
-SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "h200,dsr1,fp8,${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
+# Framework-specific srtctl flags are collected in an array so that the apply
+# command and its tag list are written exactly once.
+SRTCTL_EXTRA_ARGS=()
+if [[ "$FRAMEWORK" == "dynamo-sglang" ]]; then
+    # dynamo-sglang submissions additionally pass fix-timeouts-x86.sh as the setup script.
+    SRTCTL_EXTRA_ARGS+=(--setup-script fix-timeouts-x86.sh)
+fi
+# ${arr[@]+"${arr[@]}"} expands to nothing when the array is empty, which also
+# avoids an "unbound variable" error under `set -u` on bash older than 4.4.
+SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" ${SRTCTL_EXTRA_ARGS[@]+"${SRTCTL_EXTRA_ARGS[@]}"} --tags "h200,dsr1,fp8,${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1)
 echo "$SRTCTL_OUTPUT"
 
 # Extract JOB_ID from srtctl output

From 9b96bcc284ec12a954fcf80a2c8a9491f3474976 Mon Sep 17 00:00:00 2001
From: ishandhanani <ishandhanani@gmail.com>
Date: Tue, 3 Feb 2026 12:13:32 -0800
Subject: [PATCH 6/6] Use full cu130 sglang image instead of cu130-runtime

---
 .github/configs/nvidia-master.yaml | 2 +-
 perf-changelog.yaml                | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3b14a4213..369aa5796 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2832,7 +2832,7 @@ gptoss-fp4-gb200-dynamo-trt:
 
 
 dsr1-fp8-h200-dynamo-sglang:
-  image: lmsysorg/sglang:v0.5.8-cu130-runtime
+  image: lmsysorg/sglang:v0.5.8-cu130
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: h200-multinode-slurm
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -330,7 +330,7 @@
     - dsr1-fp8-h200-dynamo-sglang
   description:
     - "Add DSR1 FP8 H200 Dynamo SGLang disaggregated multinode configuration"
-    - "Image: lmsysorg/sglang:v0.5.8-cu130-runtime"
+    - "Image: lmsysorg/sglang:v0.5.8-cu130"
     - "Runner: h200-multinode-slurm with multinode and disagg enabled"
     - "Recipes sourced from srtslurm repo (recipes/h200/)"
     - "1k1k configs: aggregated, low-latency (1P9D), high-throughput TEP (1P6D), DEP (1P6D)"