diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index b316bcede..00fd01936 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -551,7 +551,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
 dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.8-rocm700-mi35x-mori-0210
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -563,9 +563,9 @@ dsr1-fp8-mi355x-sglang-disagg:
   - isl: 1024
     osl: 1024
     search-space:
-    # MTP configurations
-    # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
-    - spec-decoding: "mtp"
+    # non-MTP configurations
+    # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
+    - spec-decoding: "none"
       conc-list: [ 1024, 2048 ]
       prefill:
         num-worker: 1
@@ -576,16 +576,16 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 16
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_MTP_SIZE=0"
 
-    # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 1536, 1024, 512, 256 ]
+    # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
+    - spec-decoding: "none"
+      conc-list: [ 1536, 1024, 512 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -595,17 +595,17 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_MTP_SIZE=0"
 
 
     # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+    - spec-decoding: "none"
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -621,13 +621,13 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=0"
 
-    - spec-decoding: "mtp"
-      conc-list: [ 4, 2, 1 ]
+    - spec-decoding: "none"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -640,51 +640,34 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=0"
 
+  - isl: 8192
+    osl: 1024
+    search-space:
     # non-MTP configurations
-    # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
+    # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP8)
     - spec-decoding: "none"
       conc-list: [ 1024, 2048 ]
       prefill:
-        num-worker: 1
+        num-worker: 2
         tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 1
-        ep: 16
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 1536, 1024, 512, 256 ]
-      prefill:
+        - "PREFILL_NODES=2"
+      decode:
         num-worker: 1
         tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 1
         ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=2"
+        - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
+    # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
     - spec-decoding: "none"
-      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -703,10 +686,10 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=0"
 
     - spec-decoding: "none"
-      conc-list: [ 4, 2, 1 ]
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -721,78 +704,53 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-  - isl: 8192
+
+dsr1-fp8-mi355x-sglang-disagg-mtp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  model: deepseek-ai/DeepSeek-R1-0528
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp8
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
     osl: 1024
     search-space:
     # MTP configurations
+    # "Top of curve" (1 prefill worker at TP8 and 1 decode worker at DEP16)
     - spec-decoding: "mtp"
       conc-list: [ 1024, 2048 ]
       prefill:
-        num-worker: 2
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
         num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 2
         tp: 8
         ep: 1
         dp-attn: false
         additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16)
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 16
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
 
-    # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8)
+    # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8)
     - spec-decoding: "mtp"
-      conc-list: [ 1536, 1024 ]
+      conc-list: [ 1536, 1024, 512, 256 ]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
+        tp: 8
+        ep: 1
+        dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
@@ -802,7 +760,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
     # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
     - spec-decoding: "mtp"
-      conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
+      conc-list: [ 256, 128, 64, 32, 16, 8, 4 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -821,10 +779,10 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_MTP_SIZE=2"
 
     - spec-decoding: "mtp"
-      conc-list: [ 4, 2, 1 ]
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -839,83 +797,31 @@ dsr1-fp8-mi355x-sglang-disagg:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=2"
 
-    # non-MTP configurations
-    - spec-decoding: "none"
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP8)
+    - spec-decoding: "mtp"
       conc-list: [ 1024, 2048 ]
       prefill:
         num-worker: 2
-        tp: 1
+        tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=2"
       decode:
         num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024 ]
-      prefill:
-        num-worker: 2
         tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 1
         ep: 8
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 1
-        ep: 16
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 1536, 1024 ]
-      prefill:
-        num-worker: 1
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 1
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=1"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "none"
+    # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8)
+    - spec-decoding: "mtp"
       conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ]
       prefill:
         num-worker: 1
@@ -932,13 +838,13 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=2"
 
-    - spec-decoding: "none"
-      conc-list: [ 4, 2, 1 ]
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 32, 16, 8, 4, 2, 1 ]
       prefill:
         num-worker: 1
-        tp: 8
+        tp: 4
         ep: 1
         dp-attn: false
         additional-settings:
@@ -951,7 +857,7 @@ dsr1-fp8-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=2"
 
   # FIXME(billishyahao): disable 1k8k for now
   # - isl: 1024
@@ -1078,7 +984,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.7-rocm700-mi35x-mori-fp4-0122
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: amd/DeepSeek-R1-0528-MXFP4
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1090,48 +996,48 @@ dsr1-fp4-mi355x-sglang-disagg:
   - isl: 1024
     osl: 1024
     search-space:
-    # MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 512, 256 ]
+    # non-MTP configurations
+    # 1P1D TP8
+    - spec-decoding: "none"
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 8
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
-        num-worker: 2
-        tp: 1
-        ep: 8
+        num-worker: 1
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 32, 64, 128 ]
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 2, 4, 8, 16, 32 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_MTP_SIZE=0"
 
-    - spec-decoding: "mtp"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1139,7 +1045,6 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
@@ -1147,50 +1052,72 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=0"
 
-    # non-MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 1024, 512, 256 ]
+    # 1P2D TP4
+    - spec-decoding: "none"
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 8
+        tp: 4
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
-        ep: 8
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1P1D pure TP8
     - spec-decoding: "none"
-      conc-list: [ 32, 64, 128 ]
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
 
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
+    # 1P2D TP8
     - spec-decoding: "none"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1198,7 +1125,25 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
 
+    # 1P2D TP4
+    - spec-decoding: "none"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 8
@@ -1208,51 +1153,61 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-  - isl: 8192
+dsr1-fp4-mi355x-sglang-disagg-mtp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  model: amd/DeepSeek-R1-0528-MXFP4
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
     osl: 1024
     search-space:
     # MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
+    # 1P1D TP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 512, 256 ]
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 8
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
-        num-worker: 2
-        tp: 1
-        ep: 8
+        num-worker: 1
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "mtp"
-      conc-list: [ 32, 64, 128 ]
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 2, 4, 8, 16, 32 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
+        - "DECODE_MTP_SIZE=3"
 
-    - spec-decoding: "mtp"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1260,7 +1215,6 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
-
       decode:
         num-worker: 2
         tp: 8
@@ -1268,50 +1222,74 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=2"
+        - "DECODE_MTP_SIZE=1"
 
-    # non-MTP configurations
-    # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-    - spec-decoding: "none"
-      conc-list: [ 1024, 512, 256 ]
+    # 1P2D TP4
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
-        tp: 1
-        ep: 8
+        tp: 4
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
         num-worker: 2
-        tp: 1
-        ep: 8
+        tp: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=1"
 
-    # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-    - spec-decoding: "none"
-      conc-list: [ 32, 64, 128 ]
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1P1D pure TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1, 2, 4, 8 ]
       prefill:
         num-worker: 1
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
 
+
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 8
-        ep: 8
+        ep: 1
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=3"
 
-    - spec-decoding: "none"
-      conc-list: [ 128, 64, 32, 16, 8, 4 ]
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
       prefill:
         num-worker: 1
         tp: 8
@@ -1319,7 +1297,25 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
 
+    # 1P2D TP4
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
       decode:
         num-worker: 2
         tp: 8
@@ -1327,7 +1323,8 @@ dsr1-fp4-mi355x-sglang-disagg:
         dp-attn: false
         additional-settings:
         - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
+        - "DECODE_MTP_SIZE=1"
+
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index edfebc755..5280b6767 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -40,7 +40,7 @@
 DeepSeek-V3:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -71,7 +71,7 @@ DeepSeek-V3:
 DeepSeek-V3-0324:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -102,7 +102,38 @@ DeepSeek-V3-0324:
 DeepSeek-R1:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -133,7 +164,38 @@ DeepSeek-R1:
 DeepSeek-R1-0528-MXFP4-Preview:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: 16384
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 16384
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528-MXFP4:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index dadea4728..a64f85c75 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -220,8 +220,9 @@ fi
 # =============================================================================
 IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
 
-PREFILL_NODES_PER_WORKER=$((PREFILL_TP_SIZE / GPUS_PER_NODE))
-DECODE_NODES_PER_WORKER=$((DECODE_TP_SIZE / GPUS_PER_NODE))
+# Nodes per worker = ceil(TP_SIZE / GPUS_PER_NODE); a worker with TP smaller than one node still occupies a full node
+PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
+DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE))
 NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
 
 # Build prefill arguments dynamically based on xP
diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
index a2c3622b9..44bd130f3 100755
--- a/benchmarks/multi_node/amd_utils/submit.sh
+++ b/benchmarks/multi_node/amd_utils/submit.sh
@@ -70,8 +70,10 @@ PREFILL_ENABLE_EP=${9:-1}
 PREFILL_ENABLE_DP=${10:-1}
 DECODE_ENABLE_EP=${11:-1}
 DECODE_ENABLE_DP=${12:-1}
-RANDOM_RANGE_RATIO=${13}
-NODE_LIST=${14}
+PREFILL_TP=${13:-8}
+DECODE_TP=${14:-8}
+RANDOM_RANGE_RATIO=${15}
+NODE_LIST=${16}
 
 
 NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
@@ -89,10 +91,10 @@ export yD=$DECODE_WORKERS
 export NUM_NODES=$NUM_NODES
 export GPUS_PER_NODE=$GPUS_PER_NODE
 export MODEL_NAME=$MODEL_NAME
-export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $GPUS_PER_NODE / $PREFILL_WORKERS ))
+export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
 export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
 export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
-export DECODE_TP_SIZE=$(( $DECODE_NODES * $GPUS_PER_NODE / $DECODE_WORKERS ))
+export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
 export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
 export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
 export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}
diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
index aaaf97075..6a7314ab4 100644
--- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
@@ -33,17 +33,27 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
 # Set up SGL launch script-specific environment variables
 export TIME_LIMIT="08:00:00"
 export MODEL_PATH=$MODEL_PATH
-export MODEL_NAME="DeepSeek-R1-0528-MXFP4-Preview"
+export MODEL_NAME=$MODEL_NAME
 export CONTAINER_IMAGE=$IMAGE
 
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
 export PREFILL_ENABLE_EP=true
+fi
+
 if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
 export PREFILL_ENABLE_DP=true
 else
 export PREFILL_ENABLE_DP=false
 fi
 
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
 export DECODE_ENABLE_EP=true
+fi
+
 if [[ "$DECODE_DP_ATTN" == "true" ]]; then
 export DECODE_ENABLE_DP=true
 else
@@ -61,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $ISL $OSL "${CONC_LIST// /x}" inf \
     ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
     ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    "${PREFILL_TP:-8}" "${DECODE_TP:-8}" \
     ${RANDOM_RANGE_RATIO})
 
 if [[ $? -ne 0 ]]; then
diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
index 0a5a5c30b..0124d4b4d 100644
--- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
+++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
@@ -33,7 +33,7 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
 # Set up SGL launch script-specific environment variables
 export TIME_LIMIT="08:00:00"
 export MODEL_PATH=$MODEL_PATH
-export MODEL_NAME="DeepSeek-R1"
+export MODEL_NAME=$MODEL_NAME
 export CONTAINER_IMAGE=$IMAGE
 
 if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
@@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $ISL $OSL "${CONC_LIST// /x}" inf \
     ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
     ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    "${PREFILL_TP:-8}" "${DECODE_TP:-8}" \
     ${RANDOM_RANGE_RATIO})
 
 if [[ $? -ne 0 ]]; then
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a48f42b6b..05b5dd5ce 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -835,6 +835,18 @@
     - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839
 
+- config-keys:
+    - dsr1-fp8-mi355x-sglang-disagg
+    - dsr1-fp8-mi355x-sglang-disagg-mtp
+    - dsr1-fp4-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg-mtp
+  description:
+    - "Add more sweep configs for MI355X FP8/FP4 Disagg"
+    - "Add TP/DP/EP size < 8 support"
+    - "Support DSR1-0528 MTP Disagg"
+    - "Bump SGL mori image to Feb 27"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823
+
 - config-keys:
     - minimaxm2.5-fp8-h100-vllm
   description:
diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
index 2b9902b0b..f4f1e561f 100644
--- a/runners/launch_mi355x-amds.sh
+++ b/runners/launch_mi355x-amds.sh
@@ -34,7 +34,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     export SLURM_PARTITION="compute"
     export SLURM_JOB_NAME="benchmark-sglang-disagg.job"
 
-    export MODEL_NAME="DeepSeek-R1"
+    export MODEL_NAME=${MODEL##*/}
     export MODEL_PATH="/it-share/data"
     export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7"
     export MORI_RDMA_TC=104