From b7b85df738e94e84ce3264d4b147b9cceaddfc3e Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Tue, 24 Feb 2026 08:46:51 +0000
Subject: [PATCH 01/17] Update Image for DSR1 FP4 MI355X SGLang Disagg

New image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2
Goal: fix the issue where DP Attention is broken for FP4 Disagg on AMD
---
 .github/configs/amd-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d7e51cb28..d5536bd1f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -918,7 +918,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.7-rocm700-mi35x-mori-fp4-0122
+  image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2
   model: amd/DeepSeek-R1-0528-MXFP4
   model-prefix: dsr1
   runner: mi355x-disagg

From df3f4e0cad549e9291e1b3a82739917db9b3f236 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Tue, 24 Feb 2026 11:55:37 +0000
Subject: [PATCH 02/17] Turn on DP Attention

---
 .github/configs/amd-master.yaml | 48 ++++++++++++++++-----------------
 1 file changed, 24 insertions(+), 24 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d5536bd1f..ddcb7c0df 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -938,14 +938,14 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
@@ -957,7 +957,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -965,7 +965,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
@@ -976,7 +976,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -984,7 +984,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=2"
@@ -997,14 +997,14 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
@@ -1016,7 +1016,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1024,7 +1024,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
@@ -1035,7 +1035,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1043,7 +1043,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
@@ -1059,14 +1059,14 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
@@ -1078,7 +1078,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1086,7 +1086,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
@@ -1097,7 +1097,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1105,7 +1105,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=2"
@@ -1118,14 +1118,14 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 1
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
@@ -1137,7 +1137,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1145,7 +1145,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
@@ -1156,7 +1156,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1164,7 +1164,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"

From 7421f6f71bd241743f930e375d9f8cea962274f3 Mon Sep 17 00:00:00 2001
From: Chun Fang <chun.fang@amd.com>
Date: Tue, 24 Feb 2026 18:24:46 +0000
Subject: [PATCH 03/17] Turn off dp-attn for prefill

---
 .github/configs/amd-master.yaml | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ddcb7c0df..d4dbfe80d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -938,7 +938,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
@@ -957,7 +957,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -976,7 +976,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -997,7 +997,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
@@ -1016,7 +1016,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1035,7 +1035,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1059,7 +1059,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
@@ -1078,7 +1078,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1097,7 +1097,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1118,7 +1118,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 1
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
@@ -1137,7 +1137,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 
@@ -1156,7 +1156,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         num-worker: 1
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
 

From 5d2f28bd6591a8fc7134581a340928c28970c5bc Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 27 Feb 2026 09:53:11 +0000
Subject: [PATCH 04/17] add fp4 configs

---
 .github/configs/amd-master.yaml               | 227 +++++++++---------
 .../dsr1_fp4_mi355x_sglang-disagg.sh          |  12 +-
 .../dsr1_fp8_mi355x_sglang-disagg.sh          |   2 +-
 runners/launch_mi355x-amds.sh                 |   2 +-
 4 files changed, 133 insertions(+), 110 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d4dbfe80d..c482c0124 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -918,7 +918,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2
+  image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp
   model: amd/DeepSeek-R1-0528-MXFP4
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -930,48 +930,29 @@ dsr1-fp4-mi355x-sglang-disagg:
   - isl: 1024
     osl: 1024
     search-space:
-    # MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 512, 256 ]
+    # non-MTP configurations
+    # 1P1D TP8
+    - spec-decoding: "none"
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 8
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
-        num-worker: 2
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 32, 64, 128 ]
-      prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
-        - "PREFILL_NODES=1"
-
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
 
-    - spec-decoding: "mtp"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -979,58 +960,60 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=0"
 
-    # non-MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
+    # 1P2D DEP8+TEP8
     - spec-decoding: "none"
-      conc-list: [ 1024, 512, 256 ]
+      conc-list: [ 256, 512 ]
       prefill:
         num-worker: 1
-        tp: 1
+        tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1P1D pure TP8
     - spec-decoding: "none"
-      conc-list: [ 32, 64, 128 ]
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
-        num-worker: 2
+        num-worker: 1
         tp: 8
-        ep: 8
-        dp-attn: true
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "DECODE_NODES=2"
+        - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
+    # 1P2D TP8
     - spec-decoding: "none"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1038,61 +1021,70 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 512, 256 ]
+    # 2P1D DEP8+TEP8
+    - spec-decoding: "none"
+      conc-list: [ 256, 512 ]
       prefill:
-        num-worker: 1
-        tp: 1
+        num-worker: 2
+        tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
+        - "PREFILL_NODES=2"
       decode:
-        num-worker: 2
-        tp: 1
+        num-worker: 1
+        tp: 8
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
+dsr1-fp4-mi355x-sglang-disagg-mtp:
+  image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1P1D TP8
     - spec-decoding: "mtp"
-      conc-list: [ 32, 64, 128 ]
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
-        num-worker: 2
+        num-worker: 1
         tp: 8
-        ep: 8
-        dp-attn: true
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
 
-    - spec-decoding: "mtp"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+    # 1P2D TP8
+    - spec-decoding: "mtp" 
+      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1100,58 +1092,60 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=3"
 
-    # non-MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 1024, 512, 256 ]
+    # 1P2D DEP8+TEP8
+    - spec-decoding: "mtp" 
+      conc-list: [ 256, 512 ]
       prefill:
         num-worker: 1
-        tp: 1
+        tp: 8
         ep: 8
-        dp-attn: false
+        dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=1"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "none"
-      conc-list: [ 32, 64, 128 ]
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1P1D pure TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
-        num-worker: 2
+        num-worker: 1
         tp: 8
-        ep: 8
-        dp-attn: true
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
 
-    - spec-decoding: "none"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1159,15 +1153,34 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
         ep: 1
-        dp-attn: true
+        dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=3"
+
+    # 2P1D DEP8+TEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 256, 512 ]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
index aaaf97075..f568b2b5d 100644
--- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
@@ -33,17 +33,27 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
 # Set up SGL launch script-specific environment variables
 export TIME_LIMIT="08:00:00"
 export MODEL_PATH=$MODEL_PATH
-export MODEL_NAME="DeepSeek-R1-0528-MXFP4-Preview"
+export MODEL_NAME=$MODEL_NAME
 export CONTAINER_IMAGE=$IMAGE
 
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
 export PREFILL_ENABLE_EP=true
+fi
+
 if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
 export PREFILL_ENABLE_DP=true
 else
 export PREFILL_ENABLE_DP=false
 fi
 
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
 export DECODE_ENABLE_EP=true
+fi
+
 if [[ "$DECODE_DP_ATTN" == "true" ]]; then
 export DECODE_ENABLE_DP=true
 else
diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
index 0a5a5c30b..9a0ea92e7 100644
--- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
@@ -33,7 +33,7 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
 # Set up SGL launch script-specific environment variables
 export TIME_LIMIT="08:00:00"
 export MODEL_PATH=$MODEL_PATH
-export MODEL_NAME="DeepSeek-R1"
+export MODEL_NAME=$MODEL_NAME
 export CONTAINER_IMAGE=$IMAGE
 
 if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 2b9902b0b..f4f1e561f 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -34,7 +34,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     export SLURM_PARTITION="compute"
     export SLURM_JOB_NAME="benchmark-sglang-disagg.job"
 
-    export MODEL_NAME="DeepSeek-R1"
+    export MODEL_NAME=${MODEL##*/}
     export MODEL_PATH="/it-share/data"
     export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7"
     export MORI_RDMA_TC=104

From b434f0a138042378b4fc4a70ab1ea4da4c0ec992 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 27 Feb 2026 10:19:35 +0000
Subject: [PATCH 05/17] add models yaml

---
 benchmarks/multi_node/amd_utils/models.yaml | 31 +++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index edfebc755..3a66eb46b 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -160,3 +160,34 @@ DeepSeek-R1-0528-MXFP4-Preview:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528-MXFP4:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: 16384
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 16384
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"

From 55ef22f6fdae13186a13c272ebf0541bd375c878 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 28 Feb 2026 06:15:23 +0000
Subject: [PATCH 06/17] add more fp8 configs

---
 .github/configs/amd-master.yaml             | 228 ++++++--------------
 benchmarks/multi_node/amd_utils/models.yaml |  31 +++
 2 files changed, 98 insertions(+), 161 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index c482c0124..1d9dcea89 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -391,7 +391,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
 dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.8-rocm700-mi35x-mori-0210
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -403,9 +403,9 @@ dsr1-fp8-mi355x-sglang-disagg:
   - isl: 1024
     osl: 1024
     search-space:
-    # MTP configurations
-    # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
-    - spec-decoding: "mtp"
+    # non-MTP configurations
+    # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
+    - spec-decoding: "none"
       conc-list: [ 1024, 2048 ]
       prefill:
         num-worker: 1
@@ -416,15 +416,15 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 1
-        tp: 1
+        tp: 16
         ep: 16
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_MTP_SIZE=0"
 
-    # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8)
-    - spec-decoding: "mtp"
+    # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8)
+    - spec-decoding: "none"
       conc-list: [ 1536, 1024, 512, 256 ]
       prefill:
         num-worker: 1
@@ -435,16 +435,16 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_MTP_SIZE=0"
 
 
     # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "mtp"
+    - spec-decoding: "none"
       conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
       prefill:
         num-worker: 1
@@ -461,13 +461,13 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=0"
 
-    - spec-decoding: "mtp"
-      conc-list: [ 4, 2, 1 ]
+    - spec-decoding: "none"
+      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -480,49 +480,32 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=0"
 
+  - isl: 8192
+    osl: 1024
+    search-space:
     # non-MTP configurations
-    # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
+    # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8)
     - spec-decoding: "none"
       conc-list: [ 1024, 2048 ]
       prefill:
-        num-worker: 1
+        num-worker: 2
         tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 1
-        ep: 16
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 1536, 1024, 512, 256 ]
-      prefill:
+        - "PREFILL_NODES=2"
+      decode:
         num-worker: 1
         tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 1
         ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=2"
+        - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
+    # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
     - spec-decoding: "none"
       conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
       prefill:
@@ -543,10 +526,10 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     - spec-decoding: "none"
-      conc-list: [ 4, 2, 1 ]
+      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -561,78 +544,53 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-  - isl: 8192
+
+dsr1-fp8-mi355x-sglang-disagg-mtp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
     osl: 1024
     search-space:
     # MTP configurations
+    # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
     - spec-decoding: "mtp"
       conc-list: [ 1024, 2048 ]
       prefill:
-        num-worker: 2
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
         num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 2
         tp: 8
         ep: 1
         dp-attn: false
         additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 1
-        tp: 1
+        tp: 16
         ep: 16
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
 
-    # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8)
+    # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
     - spec-decoding: "mtp"
-      conc-list: [ 1536, 1024 ]
+      conc-list: [ 1536, 1024, 512, 256 ]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
+        tp: 8
+        ep: 1
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
@@ -661,10 +619,10 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=2"
 
     - spec-decoding: "mtp"
-      conc-list: [ 4, 2, 1 ]
+      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -679,83 +637,31 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=2"
 
-    # non-MTP configurations
-    - spec-decoding: "none"
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # "Top of curve" (2 prefill workers at DEP8 and 1 decode worker at DEP8)
+    - spec-decoding: "mtp"
       conc-list: [ 1024, 2048 ]
       prefill:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=2"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 2
         tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 1
         ep: 8
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 1
-        ep: 16
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 1536, 1024 ]
-      prefill:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=1"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "none"
+    # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
+    - spec-decoding: "mtp"
       conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
       prefill:
         num-worker: 1
@@ -772,13 +678,13 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=2"
 
-    - spec-decoding: "none"
-      conc-list: [ 4, 2, 1 ]
+    - spec-decoding: "mtp"
+      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -791,7 +697,7 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=2"
 
   # FIXME(billishyahao): disable 1k8k for now
   # - isl: 1024
@@ -918,7 +824,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: amd/DeepSeek-R1-0528-MXFP4
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1050,7 +956,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: amd/DeepSeek-R1-0528-MXFP4
   model-prefix: dsr1
   runner: mi355x-disagg
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 3a66eb46b..5cfad98db 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -130,6 +130,37 @@ DeepSeek-R1:
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
 
+DeepSeek-R1-0528:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
 DeepSeek-R1-0528-MXFP4-Preview:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"

From 618dab4d74c4e0bb6491f1150a8501a5053c6fcb Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 28 Feb 2026 06:18:19 +0000
Subject: [PATCH 07/17] specify moriep normal mode

---
 benchmarks/multi_node/amd_utils/models.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 5cfad98db..5280b6767 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -40,7 +40,7 @@
 DeepSeek-V3:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -71,7 +71,7 @@ DeepSeek-V3:
 DeepSeek-V3-0324:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -102,7 +102,7 @@ DeepSeek-V3-0324:
 DeepSeek-R1:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -133,7 +133,7 @@ DeepSeek-R1:
 DeepSeek-R1-0528:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -164,7 +164,7 @@ DeepSeek-R1-0528:
 DeepSeek-R1-0528-MXFP4-Preview:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -195,7 +195,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
 DeepSeek-R1-0528-MXFP4:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true

From 08b90f01d8627e2ea11da9f1d1811eac9bcbf6d3 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 28 Feb 2026 09:56:06 +0000
Subject: [PATCH 08/17] add fp4 configs

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 1d9dcea89..91ae17c17 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1009,7 +1009,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
     # 1P2D DEP8+TEP8
     - spec-decoding: "mtp" 
-      conc-list: [ 256, 512 ]
+      conc-list: [ 256, 512, 1024 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1070,7 +1070,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
     # 2P1D DEP8+TEP8
     - spec-decoding: "mtp"
-      conc-list: [ 256, 512 ]
+      conc-list: [ 256, 512, 1024 ]
       prefill:
         num-worker: 2
         tp: 8

From 490645237e7acfef2782c398847862926fd966ee Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 28 Feb 2026 11:06:31 +0000
Subject: [PATCH 09/17] fix fp4 configs

---
 .github/configs/amd-master.yaml | 84 ++++++++++++++++++++++++---------
 1 file changed, 62 insertions(+), 22 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 91ae17c17..ddaec6e57 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -445,7 +445,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
     # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
     - spec-decoding: "none"
-      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -464,7 +464,7 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     - spec-decoding: "none"
-      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
+      conc-list: [ 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -507,7 +507,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
     # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
     - spec-decoding: "none"
-      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -526,7 +526,7 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     - spec-decoding: "none"
-      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
+      conc-list: [ 128, 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -600,7 +600,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
 
     # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
     - spec-decoding: "mtp"
-      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -619,7 +619,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
     - spec-decoding: "mtp"
-      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
+      conc-list: [ 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -681,7 +681,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
     - spec-decoding: "mtp"
-      conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ]
+      conc-list: [ 4, 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -990,7 +990,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
     # 1P2D TP8
     - spec-decoding: "mtp" 
-      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
+      conc-list: [ 2, 4, 8, 16, 32 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1007,25 +1007,45 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=3"
 
-    # 1P2D DEP8+TEP8
+    # 1P2D TP8
     - spec-decoding: "mtp" 
-      conc-list: [ 256, 512, 1024 ]
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
-        dp-attn: true
+        ep: 1
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
 
+
   - isl: 8192
     osl: 1024
     search-space:
@@ -1049,9 +1069,10 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=3"
 
+
     # 1P2D TP8
     - spec-decoding: "mtp"
-      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
+      conc-list: [ 2, 4, 8, 16, 32 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1068,23 +1089,42 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=3"
 
-    # 2P1D DEP8+TEP8
+    # 1P2D TP8
     - spec-decoding: "mtp"
-      conc-list: [ 256, 512, 1024 ]
+      conc-list: [ 64, 128, 256 ]
       prefill:
-        num-worker: 2
+        num-worker: 1
         tp: 8
-        ep: 8
-        dp-attn: true
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "PREFILL_NODES=2"
+        - "PREFILL_NODES=1"
       decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
         num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
-        - "DECODE_NODES=1"
+        - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
 
 

From 70a707a90057669bba23a9978ef460af2ec76524 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 28 Feb 2026 15:02:18 +0000
Subject: [PATCH 10/17] add change log

---
 perf-changelog.yaml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 9c4c9e438..74790a5f5 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -733,4 +733,16 @@
     - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)"
     - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/699
+
+- config-keys:
+    - dsr1-fp8-mi355x-sglang-disagg
+    - dsr1-fp8-mi355x-sglang-disagg-mtp
+    - dsr1-fp4-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg-mtp
+  description:
+    - "Add more sweep configs for MI355X FP8/FP4 Disagg"
+    - "Add TP/DP/EP size < 8 support"
+    - "Support DSR1-0528 MTP Disagg"
+    - "Bump SGL mori image to Feb 27"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823
   

From 52f97ca1f12742d577af9c4e2cc44326ec6a3769 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sun, 1 Mar 2026 13:20:39 +0000
Subject: [PATCH 11/17] add tp4 support

---
 benchmarks/multi_node/amd_utils/server.sh              |  5 +++--
 benchmarks/multi_node/amd_utils/submit.sh              | 10 ++++++----
 benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh |  1 +
 benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh |  1 +
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index dadea4728..a64f85c75 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -220,8 +220,9 @@ fi
 # =============================================================================
 IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
 
-PREFILL_NODES_PER_WORKER=$((PREFILL_TP_SIZE / GPUS_PER_NODE))
-DECODE_NODES_PER_WORKER=$((DECODE_TP_SIZE / GPUS_PER_NODE))
+# Ceiling division by GPUS_PER_NODE for nodes-per-worker
+PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE))
+DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE))
 NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
 
 # Build prefill arguments dynamically based on xP
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index a2c3622b9..44bd130f3 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -70,8 +70,10 @@ PREFILL_ENABLE_EP=${9:-1}
 PREFILL_ENABLE_DP=${10:-1}
 DECODE_ENABLE_EP=${11:-1}
 DECODE_ENABLE_DP=${12:-1}
-RANDOM_RANGE_RATIO=${13}
-NODE_LIST=${14}
+PREFILL_TP=${13:-8}
+DECODE_TP=${14:-8}
+RANDOM_RANGE_RATIO=${15}
+NODE_LIST=${16}
 
 
 NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
@@ -89,10 +91,10 @@ export yD=$DECODE_WORKERS
 export NUM_NODES=$NUM_NODES
 export GPUS_PER_NODE=$GPUS_PER_NODE
 export MODEL_NAME=$MODEL_NAME
-export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $GPUS_PER_NODE / $PREFILL_WORKERS ))
+export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
 export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
 export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
-export DECODE_TP_SIZE=$(( $DECODE_NODES * $GPUS_PER_NODE / $DECODE_WORKERS ))
+export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
 export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
 export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
 export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}
diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
index f568b2b5d..6a7314ab4 100644
--- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
@@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $ISL $OSL "${CONC_LIST// /x}" inf \
     ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
     ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP:-8} ${DECODE_TP:-8} \
     ${RANDOM_RANGE_RATIO})
 
 if [[ $? -ne 0 ]]; then
diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
index 9a0ea92e7..0124d4b4d 100644
--- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
@@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $ISL $OSL "${CONC_LIST// /x}" inf \
     ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
     ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP:-8} ${DECODE_TP:-8} \
     ${RANDOM_RANGE_RATIO})
 
 if [[ $? -ne 0 ]]; then

From 7f664ce905f7d09c9ec850c3a083311389bd687e Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sun, 1 Mar 2026 14:31:52 +0000
Subject: [PATCH 12/17] fix fp4 configs

---
 .github/configs/amd-master.yaml | 82 ++++++++++++++++++++++++---------
 1 file changed, 60 insertions(+), 22 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index fb562fbc7..5f40e62a9 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -464,7 +464,7 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     - spec-decoding: "none"
-      conc-list: [ 2, 1 ]
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -526,7 +526,7 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     - spec-decoding: "none"
-      conc-list: [ 128, 2, 1 ]
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -619,7 +619,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
     - spec-decoding: "mtp"
-      conc-list: [ 2, 1 ]
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -681,7 +681,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
     - spec-decoding: "mtp"
-      conc-list: [ 4, 2, 1 ]
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -858,7 +858,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
     # 1P2D TP8
     - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
+      conc-list: [ 2, 4, 8, 16, 32 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -875,20 +875,39 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-    # 1P2D DEP8+TEP8
-    - spec-decoding: "none"
-      conc-list: [ 256, 512 ]
+    # 1P2D TP8
+    - spec-decoding: "mtp" 
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
-        dp-attn: true
+        ep: 1
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
@@ -918,8 +937,8 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     # 1P2D TP8
-    - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ]
+    - spec-decoding: "mtp"
+      conc-list: [ 2, 4, 8, 16, 32 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -936,23 +955,42 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-    # 2P1D DEP8+TEP8
-    - spec-decoding: "none"
-      conc-list: [ 256, 512 ]
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
       prefill:
-        num-worker: 2
+        num-worker: 1
         tp: 8
-        ep: 8
-        dp-attn: true
+        ep: 1
+        dp-attn: false
         additional-settings:
-        - "PREFILL_NODES=2"
+        - "PREFILL_NODES=1"
       decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
         num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
-        - "DECODE_NODES=1"
+        - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:

From 02a9806f030d12db61e1c0319364914ffa95c3d0 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sun, 1 Mar 2026 15:38:10 +0000
Subject: [PATCH 13/17] fix

---
 .github/configs/amd-master.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5f40e62a9..3c277cfec 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -876,7 +876,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     # 1P2D TP8
-    - spec-decoding: "mtp" 
+    - spec-decoding: "none" 
       conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
@@ -895,7 +895,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     # 1P2D TP4
-    - spec-decoding: "mtp" 
+    - spec-decoding: "none" 
       conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
@@ -937,7 +937,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     # 1P2D TP8
-    - spec-decoding: "mtp"
+    - spec-decoding: "none"
       conc-list: [ 2, 4, 8, 16, 32 ]
       prefill:
         num-worker: 1
@@ -956,7 +956,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     # 1P2D TP8
-    - spec-decoding: "mtp"
+    - spec-decoding: "none"
       conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
@@ -975,7 +975,7 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     # 1P2D TP4
-    - spec-decoding: "mtp"
+    - spec-decoding: "none"
       conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1

From e95f94ba7ac9f483b2a7c600fe6138c650cca9d6 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 2 Mar 2026 14:00:18 +0000
Subject: [PATCH 14/17] fix ep16 config

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index c2b5f4275..f99bfcf10 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -492,7 +492,7 @@ dsr1-fp8-mi355x-sglang-disagg:
         ep: 16
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=2"
+        - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
     # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8)
@@ -647,7 +647,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         ep: 16
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=2"
+        - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
     # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)

From 0d7699d09cbb883479f1945315caff45b461e090 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 2 Mar 2026 14:25:02 +0000
Subject: [PATCH 15/17] fix 1k1k config

---
 .github/configs/amd-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f99bfcf10..8406045ad 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -652,7 +652,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
 
     # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
     - spec-decoding: "mtp"
-      conc-list: [ 1536, 1024, 512, 256 ]
+      conc-list: [ 1536, 1024, 512 ]
       prefill:
         num-worker: 1
         tp: 8

From 57fe4408f3676a1b697a1334161eec4501aa63fc Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 2 Mar 2026 16:22:14 +0000
Subject: [PATCH 16/17] fix ep16 config

---
 .github/configs/amd-master.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 8406045ad..2ebe168b5 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -488,11 +488,11 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 1
-        tp: 16
-        ep: 16
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=1"
+        - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
     # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8)
@@ -643,11 +643,11 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 1
-        tp: 16
-        ep: 16
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=1"
+        - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
 
     # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)

From 971e78d2e5647f5cd4cc245c53a5edf84db1548a Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 3 Mar 2026 02:49:27 +0000
Subject: [PATCH 17/17] fix fp8 config

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 2ebe168b5..537bc48a2 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -497,7 +497,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
     # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8)
     - spec-decoding: "none"
-      conc-list: [ 1536, 1024, 512, 256 ]
+      conc-list: [ 1536, 1024, 512 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -652,7 +652,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
 
     # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
     - spec-decoding: "mtp"
-      conc-list: [ 1536, 1024, 512 ]
+      conc-list: [ 1536, 1024, 512, 256 ]
       prefill:
         num-worker: 1
         tp: 8