From 0383696f7c6173378dee1ab115b4151f51f47ccf Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 16 Mar 2026 08:36:19 +0000
Subject: [PATCH 01/55] [AMD] add dsr1 mxfp4 v2 sweep points

---
 .github/configs/amd-master.yaml             | 56 +++++++++++++++++++++
 benchmarks/multi_node/amd_utils/models.yaml | 31 ++++++++++++
 2 files changed, 87 insertions(+)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 5551860f2..61c842f58 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1446,6 +1446,62 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1P1D pure TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+
+
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"
+
+    
+
+
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 2bbdd91d6..4c6611571 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -222,3 +222,34 @@ DeepSeek-R1-0528-MXFP4:
       max_running_requests: 128
       chunked_prefill_size: 262144
       cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528-MXFP4-v2:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 16384
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"

From 18e05b1cbb097497a63800291b6015e8cd37e250 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 17 Mar 2026 06:36:04 +0000
Subject: [PATCH 02/55] fix: drop --ep-dispatch-algorithm fake from base_flags

---
 .github/configs/amd-master.yaml             |  3 ---
 benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++-------
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 61c842f58..f20ed38fd 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1499,9 +1499,6 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=3"
 
-    
-
-
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 4c6611571..07668659d 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -38,7 +38,7 @@
 #         cuda_graph_bs_range: str
 
 DeepSeek-V3:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -69,7 +69,7 @@ DeepSeek-V3:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-V3-0324:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -100,7 +100,7 @@ DeepSeek-V3-0324:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -131,7 +131,7 @@ DeepSeek-R1:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -162,7 +162,7 @@ DeepSeek-R1-0528:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-Preview:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-v2:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:

From 32b5d3d00cce991eb9e7a3b298c69ee7b9cf28cd Mon Sep 17 00:00:00 2001
From: Zhai Feiyue <feiyue.zhai@amd.com>
Date: Tue, 24 Mar 2026 14:59:35 +0000
Subject: [PATCH 03/55] Fix tokenizer mismatch between benchmark client and
 sglang server on transformers v5

Transformers v5 incorrectly rebuilds pre_tokenizer/decoder components for
models like DeepSeek-R1 that use LlamaTokenizerFast with a non-Llama
tokenizer architecture. The sglang server fixes this at startup, but the
benchmark client loads the tokenizer without these fixes, causing a ~5x
token count inflation (e.g. 7000 tokens -> 35000 tokens) and false
performance regressions in TTFT and throughput benchmarks.

Apply the same tokenizer fixes (pre_tokenizer/decoder restoration and
add_bos_token recovery) that sglang server applies, so client and server
tokenize identically. No-op on transformers v4.

Made-with: Cursor
---
 utils/bench_serving/backend_request_func.py | 72 ++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

diff --git a/utils/bench_serving/backend_request_func.py b/utils/bench_serving/backend_request_func.py
index 32331a398..4990ef5fa 100644
--- a/utils/bench_serving/backend_request_func.py
+++ b/utils/bench_serving/backend_request_func.py
@@ -439,6 +439,75 @@ def get_model(pretrained_model_name_or_path: str) -> str:
     return pretrained_model_name_or_path
 
 
+def _fix_tokenizer_for_sglang(tokenizer, model_path):
+    """Fix transformers v5 tokenizer to match sglang server-side behavior.
+
+    Root cause: transformers v5 (>= 5.0) changed how tokenizers are loaded.
+    Specifically, LlamaTokenizerFast.__init__ in v5 rebuilds the pre_tokenizer
+    and decoder from scratch using class-specific components, discarding the
+    originals from tokenizer.json. For models like DeepSeek-R1 that declare
+    LlamaTokenizerFast but actually use a ByteLevel/Sequence tokenizer
+    architecture, v5 incorrectly replaces the original Sequence pre_tokenizer
+    with Metaspace, and the original ByteLevel decoder with Sequence.
+
+    The sglang server applies fixes for this in hf_transformers_utils.py
+    (_fix_v5_tokenizer_components and _fix_v5_add_bos_eos_token), but the
+    benchmark client loads the tokenizer directly via AutoTokenizer without
+    these fixes. This mismatch causes the client to encode text differently
+    from the server -- e.g. a 7000-token prompt on the client becomes ~35000
+    tokens on the server, leading to ~5x TTFT inflation and false performance
+    regressions in benchmarks.
+
+    This function replicates the same fixes so the benchmark client tokenizes
+    identically to the sglang server. It is a no-op on transformers v4.
+    """
+    import json
+    from pathlib import Path
+
+    backend = getattr(tokenizer, "_tokenizer", None)  # None if the attribute is missing
+    if backend is not None:
+        try:
+            from tokenizers import Tokenizer as RawTokenizer
+            tok_file = Path(model_path) / "tokenizer.json"
+            if tok_file.is_file():  # skipped unless model_path is a local dir with tokenizer.json
+                raw = RawTokenizer.from_file(str(tok_file))
+                raw_pre = type(raw.pre_tokenizer).__name__ if raw.pre_tokenizer else None
+                loaded_pre = type(backend.pre_tokenizer).__name__ if backend.pre_tokenizer else None
+                if raw_pre and loaded_pre and raw_pre != loaded_pre:  # compares class names only
+                    backend.pre_tokenizer = raw.pre_tokenizer
+                    backend.decoder = raw.decoder
+        except Exception:  # best-effort: any failure in this step is ignored
+            pass
+
+    try:
+        config_file = Path(model_path) / "tokenizer_config.json"
+        if config_file.is_file():  # skipped unless tokenizer_config.json exists locally
+            with open(config_file) as f:
+                config = json.load(f)
+            tok_class = config.get("tokenizer_class", "")
+            bos_eos_classes = {
+                "LlamaTokenizer", "LlamaTokenizerFast",
+                "CodeLlamaTokenizer", "CodeLlamaTokenizerFast",
+                "GemmaTokenizer", "GemmaTokenizerFast", "CohereTokenizerFast",
+            }
+            if tok_class in bos_eos_classes:
+                defaults = {"add_bos_token": True, "add_eos_token": False}
+                changed = False
+                for attr in ("add_bos_token", "add_eos_token"):
+                    val = config.get(attr)
+                    if val is None:  # key absent or null in tokenizer_config.json
+                        val = defaults.get(attr, False)
+                    if getattr(tokenizer, attr, None) != val:
+                        setattr(tokenizer, f"_{attr}", val)  # sets the underscore-prefixed attribute
+                        changed = True
+                if changed and hasattr(tokenizer, "update_post_processor"):
+                    tokenizer.update_post_processor()
+    except Exception:  # best-effort: any failure in this step is ignored
+        pass
+
+    return tokenizer  # same object that was passed in, possibly modified in place
+
+
 def get_tokenizer(
     pretrained_model_name_or_path: str,
     tokenizer_mode: str = "auto",
@@ -464,11 +533,12 @@ def get_tokenizer(
         return MistralTokenizer.from_pretrained(
             str(pretrained_model_name_or_path))
     else:
-        return AutoTokenizer.from_pretrained(
+        tokenizer = AutoTokenizer.from_pretrained(
             pretrained_model_name_or_path,
             trust_remote_code=trust_remote_code,
             **kwargs,
         )
+        return _fix_tokenizer_for_sglang(tokenizer, pretrained_model_name_or_path)
 
 
 ASYNC_REQUEST_FUNCS = {

From 0bd347fe71d6689269c81569b797985618ffad7f Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Wed, 25 Mar 2026 15:28:03 +0000
Subject: [PATCH 04/55] change mtp model to fp8

---
 .github/configs/amd-master.yaml             | 369 +++++++++++++++++++-
 benchmarks/multi_node/amd_utils/models.yaml |   2 +-
 2 files changed, 369 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f20ed38fd..525595b7b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1446,8 +1446,218 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+
+dsr1-fp4-mi355x-sglang-disagg-v2:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1P1D TP8
+    - spec-decoding: "none"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP4
+    - spec-decoding: "none" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+    
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1P1D pure TP8
+    - spec-decoding: "none"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP8
+    - spec-decoding: "none"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1P2D TP4
+    - spec-decoding: "none"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
 dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0313-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1456,6 +1666,106 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
   multinode: true
   disagg: true
   seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1P1D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1, 2, 4, 8 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=3"
+
+    # 1P2D TP8
+    - spec-decoding: "mtp" 
+      conc-list: [ 2, 4, 8, 16, 32 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=3"
+
+    # 1P2D TP8
+    - spec-decoding: "mtp" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp" 
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+
   - isl: 8192
     osl: 1024
     search-space:
@@ -1499,6 +1809,63 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=3"
 
+    # 1P2D TP8
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1P2D TP4
+    - spec-decoding: "mtp"
+      conc-list: [ 64, 128, 256 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 2
+        tp: 8
+        ep: 1
+        dp-attn: false
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
 
   # FIXME(billishyahao): disable FP4 1k8k for now
   # - isl: 1024
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 07668659d..6bca6b52a 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4:
 
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
-  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8

From 754e53c00fd834dcc6093c8b164966b8019b0605 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Wed, 25 Mar 2026 15:32:42 +0000
Subject: [PATCH 05/55] change fp8 image

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 525595b7b..2cea84d01 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -596,7 +596,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
 dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -751,7 +751,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp8-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg

From f29f2d01ea990161dfc6bc79401a30a30bda9502 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 27 Mar 2026 11:20:18 +0000
Subject: [PATCH 06/55] bump image to 0327

---
 .github/configs/amd-master.yaml             | 4 ++--
 benchmarks/multi_node/amd_utils/env.sh      | 4 +++-
 benchmarks/multi_node/amd_utils/models.yaml | 2 +-
 3 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 2cea84d01..a0112d479 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1448,7 +1448,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-v2:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1657,7 +1657,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 5565c5b3b..f4b631673 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -34,7 +34,6 @@ export IBDEVICES
 export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
 
-set +x
 
 export NCCL_IB_HCA=$IBDEVICES
 
@@ -123,4 +122,7 @@ fi
 # FIXME: WA for latest upstream 0305 image
 export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
+export SGLANG_ENABLE_SPEC_V2=1
+export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
+set +x
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 6bca6b52a..eed59bdab 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -225,7 +225,7 @@ DeepSeek-R1-0528-MXFP4:
 
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
-  mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8

From a44c7eb8759e8527d8af192e5aa2ffc7f7e65fb0 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 27 Mar 2026 14:09:11 +0000
Subject: [PATCH 07/55] remove specv2

---
 benchmarks/multi_node/amd_utils/env.sh | 2 --
 1 file changed, 2 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index f4b631673..02cb77a91 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -122,7 +122,5 @@ fi
 # FIXME: WA for latest upstream 0305 image
 export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}
 
-export SGLANG_ENABLE_SPEC_V2=1
-export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
 set +x

From 25141364c930fc59e455aeca97eeeebd81e750fa Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 01:57:19 +0000
Subject: [PATCH 08/55] consolidate dsr1 fp4 configs

---
 .github/configs/amd-master.yaml | 422 +-------------------------------
 1 file changed, 1 insertion(+), 421 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a0112d479..6a96a4af2 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1027,427 +1027,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
   #       - "DECODE_NODES=2"
   #       - "DECODE_MTP_SIZE=0"
 
-
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 1P1D TP8
-    - spec-decoding: "none"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP4
-    - spec-decoding: "none" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-    
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 1P1D pure TP8
-    - spec-decoding: "none"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none"
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP8
-    - spec-decoding: "none"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1P2D TP4
-    - spec-decoding: "none"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
-  model: amd/DeepSeek-R1-0528-MXFP4
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 1P1D TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=3"
-
-    # 1P2D TP8
-    - spec-decoding: "mtp" 
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=3"
-
-    # 1P2D TP8
-    - spec-decoding: "mtp" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 1P2D TP4
-    - spec-decoding: "mtp" 
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 1P1D pure TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1, 2, 4, 8 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=3"
-
-
-    # 1P2D TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 2, 4, 8, 16, 32 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=3"
-
-    # 1P2D TP8
-    - spec-decoding: "mtp"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 1P2D TP4
-    - spec-decoding: "mtp"
-      conc-list: [ 64, 128, 256 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 1
-        dp-attn: false
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-
-dsr1-fp4-mi355x-sglang-disagg-v2:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
@@ -1656,7 +1236,7 @@ dsr1-fp4-mi355x-sglang-disagg-v2:
         - "DECODE_MTP_SIZE=0"
 
 
-dsr1-fp4-mi355x-sglang-disagg-mtp-v2:
+dsr1-fp4-mi355x-sglang-disagg-mtp:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1

From 682a4ab4ec3d42c73cd5c54b9aede2ba1fc33a54 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 02:03:58 +0000
Subject: [PATCH 09/55] bump fp8 image to 0327

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 27518d40b..a139ca560 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
 dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp8-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0323
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg

From 64bf10078c4e4f9f19486dc0f6727dc6ef1902d2 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 04:36:27 +0000
Subject: [PATCH 10/55] fix crash: drop PREFILL_DECODE_DIFFERENT_TP flags

---
 benchmarks/multi_node/amd_utils/server.sh | 12 +-----------
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 7f174b760..7340ef51c 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -187,18 +187,8 @@ else
     decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
 fi
 
-# Use Decode configuration to configure different TP/DP size between P and D
-PREFILL_DECODE_DIFFERENT_TP=""
-if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then
-    if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
-        PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}"
-    else
-        PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1"
-    fi
-fi
-
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}"
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi

From c44e1755ea6cc81f5e6f59b071ed20ddb7abefe4 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 30 Mar 2026 15:26:57 +0000
Subject: [PATCH 11/55] fix env: also export MORI_IO_SL and MORI_IO_TC

---
 benchmarks/multi_node/amd_utils/env.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 02cb77a91..88ea2ac84 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -88,17 +88,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
     if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
         TC=$(( 4 * ND_DSCP ))
         export MORI_RDMA_SL=$ND_PRIO
+        export MORI_IO_SL=$ND_PRIO
         export MORI_RDMA_TC=$TC
-        echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL"
+        export MORI_IO_TC=$TC
+        echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
     else
         echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
         # Fall back to hostname-based detection
         NODENAME=$(hostname -s)
         if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
             export MORI_RDMA_TC=96
+            export MORI_IO_TC=96
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
         elif [[ $NODENAME == mia1* ]]; then
             export MORI_RDMA_TC=104
+            export MORI_IO_TC=104
             echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
         else
             echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
@@ -109,9 +113,11 @@ else
     NODENAME=$(hostname -s)
     if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
         export MORI_RDMA_TC=96
+        export MORI_IO_TC=96
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     elif [[ $NODENAME == mia1* ]]; then
         export MORI_RDMA_TC=104
+        export MORI_IO_TC=104
         echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
     else
         echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."

From 0a41f8980559717d1e2544ac013048dbb85b8c94 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 31 Mar 2026 06:50:28 +0000
Subject: [PATCH 12/55] cleanup: remove commented-out fp8 1k8k configs

---
 .github/configs/amd-master.yaml | 123 --------------------------------
 1 file changed, 123 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a139ca560..14eec1583 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -794,129 +794,6 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=2"
 
-  # FIXME(billishyahao): disable 1k8k for now
-  # - isl: 1024
-  #   osl: 8192
-  #   search-space:
-  #   # MTP configurations
-  #   # "Top of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-  #   - spec-decoding: "mtp"
-  #     conc-list: [ 2048 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 16
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=1"
-
-
-  #   # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8)
-  #   - spec-decoding: "mtp"
-  #     conc-list: [ 256, 512, 1024 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 2
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=1"
-
-
-  #   # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-  #   - spec-decoding: "mtp"
-  #     conc-list: [ 32, 64, 128 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-
-  #     decode:
-  #       num-worker: 2
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=1"
-
-  #   # non-MTP configurations
-  #   # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16)
-  #   - spec-decoding: "none"
-  #     conc-list: [ 2048 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 16
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=0"
-
-  #   # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8)
-  #   - spec-decoding: "none"
-  #     conc-list: [ 256, 512, 1024 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 2
-  #       tp: 1
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=0"
-
-  #   # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8)
-  #   - spec-decoding: "none"
-  #     conc-list: [ 32, 64, 128 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 2
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: false
-  #       additional-settings:
-  #       - "DECODE_NODES=2"
-  #       - "DECODE_MTP_SIZE=0"
-
 dsr1-fp4-mi355x-sglang-disagg:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
   model: amd/DeepSeek-R1-0528-MXFP4-v2

From 7282748ed6da9d902f737b8843f0599f01546d26 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 31 Mar 2026 06:54:30 +0000
Subject: [PATCH 13/55] add perf change log

---
 perf-changelog.yaml | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 3dbc5eccc..1cd22211a 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1213,3 +1213,13 @@
     - "Uses nvidia/GLM-5-NVFP4 model with modelopt_fp4 quantization"
     - "Image: lmsysorg/sglang:nightly-dev-cu13-20260328-a27651d5"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/973
+
+- config-keys:
+    - dsr1-fp4-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg-mtp
+  description:
+    - "Bump SGL mori image to March 27"
+    - "Add more low latency sweep configs"
+    - "Enable v2 mxfp4 DSR1 0528 model"
+    - "Enable fp4 disp feature on mori"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983

From e6d4b3255d079f7de7ad13367120d521cb5d02a7 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 31 Mar 2026 08:14:42 +0000
Subject: [PATCH 14/55] add deprecation TODO for MORI IO env vars

---
 benchmarks/multi_node/amd_utils/env.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 88ea2ac84..0aa2d0c20 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -63,6 +63,8 @@ export MORI_MAX_DISPATCH_TOKENS_DECODE=160
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
 
 export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
+
+#TODO(billishyahao): The following IO env will be deprecated soon.
 export MORI_IO_QP_MAX_SEND_WR=16384
 export MORI_IO_QP_MAX_CQE=32768
 export MORI_IO_QP_MAX_SGE=4

From b7dd65f146b3aeea9d0592a0164d003312ece3c1 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Wed, 1 Apr 2026 13:21:09 +0000
Subject: [PATCH 15/55] add spec v2 env

---
 .github/configs/amd-master.yaml        | 4 ++--
 benchmarks/multi_node/amd_utils/env.sh | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 14eec1583..14577525c 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 0aa2d0c20..d0fa8aa9d 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -52,6 +52,10 @@ fi
 export SGLANG_MORI_FP4_DISP=False
 export SGLANG_MORI_FP8_COMB=False
 
+# Enable spec v2 
+export SGLANG_ENABLE_SPEC_V2=1
+export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
+
 # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower)
 export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
 if [[ "$MODEL_NAME" == *mxfp4* ]]; then

From 12a4ba0ab618385daf26355aa9bfa28cd9432a4f Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 2 Apr 2026 15:04:45 +0000
Subject: [PATCH 16/55] bump dsr1 fp4 disagg images to mori-0327-3

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 14577525c..18131ee9f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-2
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg

From 597a458e352859d036c8bef4c0f37145736fe58b Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 9 Apr 2026 08:55:37 +0000
Subject: [PATCH 17/55] add stream control to eliminate cpu overhead

---
 .github/configs/amd-master.yaml           | 113 ++++++++++++++++++++++
 benchmarks/multi_node/amd_utils/server.sh |   4 +-
 2 files changed, 115 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 18131ee9f..f5bc7390f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1213,3 +1213,116 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+dsr1-fp4-mi355x-sglang-disagg-exp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
+dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 7340ef51c..e27e036f9 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -188,12 +188,12 @@ else
 fi
 
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
 
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}"
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32 --stream-interval 2"
 if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
     DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
 fi

From f715e47ba79972dfd5035ac0ba6ded0adb4e9452 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 10 Apr 2026 15:01:07 +0000
Subject: [PATCH 18/55] tune exp conc-list, dispatch limits, and server flags

---
 .github/configs/amd-master.yaml             | 4 ++--
 benchmarks/multi_node/amd_utils/env.sh      | 8 +++++++-
 benchmarks/multi_node/amd_utils/models.yaml | 4 ++--
 benchmarks/multi_node/amd_utils/server.sh   | 4 ++--
 4 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f5bc7390f..e4bc8178d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1252,7 +1252,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
     # non-MTP configurations
     # 4*DEP4 + 1*DEP8
     - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
+      conc-list: [ 512, 1024, 2048 ]
       prefill:
         num-worker: 4
         tp: 4
@@ -1310,7 +1310,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
     # MTP configurations
     # 4*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048, 4096 ]
+      conc-list: [ 512, 1024, 2048 ]
       prefill:
         num-worker: 4
         tp: 4
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index d0fa8aa9d..ee9cd0087 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -56,12 +56,17 @@ export SGLANG_MORI_FP8_COMB=False
 export SGLANG_ENABLE_SPEC_V2=1
 export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
+export SGLANG_LOG_MS=true
+export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
+export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
+export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
+
 # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower)
 export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
 if [[ "$MODEL_NAME" == *mxfp4* ]]; then
     export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
 fi
-export MORI_MAX_DISPATCH_TOKENS_DECODE=160
+export MORI_MAX_DISPATCH_TOKENS_DECODE=512
 
 # set MTP size=1 when EP16
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
@@ -73,6 +78,7 @@ export MORI_IO_QP_MAX_SEND_WR=16384
 export MORI_IO_QP_MAX_CQE=32768
 export MORI_IO_QP_MAX_SGE=4
 
+
 export MORI_APP_LOG_LEVEL=INFO
 
 # Router logging control:
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index eed59bdab..3e8af0266 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -231,7 +231,7 @@ DeepSeek-R1-0528-MXFP4-v2:
     mem_fraction_static: 0.8
     disable_radix_cache: true
     dp:
-      max_running_requests: 24
+      max_running_requests: 32
       chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
       cuda_graph_bs: "1 2 3"
     no_dp:
@@ -244,7 +244,7 @@ DeepSeek-R1-0528-MXFP4-v2:
     dp:
       max_running_requests: 4096
       chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
-      cuda_graph_bs_range: "1-160"
+      cuda_graph_bs_range: "1-512"
     ep_only:
       max_running_requests: 256
       chunked_prefill_size: 262144
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index e27e036f9..141dc0d7d 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -188,12 +188,12 @@ else
 fi
 
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 "
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 8 "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
 
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32 --stream-interval 2"
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8 --stream-interval 3"
 if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
     DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
 fi

From 2ea82d5a9f18d88f059bc1c30a606d8028effb48 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 11 Apr 2026 01:28:36 +0000
Subject: [PATCH 19/55] bump image

---
 .github/configs/amd-master.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e4bc8178d..924975932 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -985,7 +985,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
     # 4*DEP4 + 1*DEP8
     - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
+      conc-list: [ 512, 1024, 2048 ]
       prefill:
         num-worker: 4
         tp: 4
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1196,7 +1196,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
     # 4*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048, 4096 ]
+      conc-list: [ 512, 1024, 2048 ]
       prefill:
         num-worker: 4
         tp: 4

From 16384e7f11bc253d993c619c9cc27cecc5ef61c0 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 11 Apr 2026 16:15:29 +0000
Subject: [PATCH 20/55] tune config

---
 .github/configs/amd-master.yaml           | 155 +++++++++++++++++++---
 benchmarks/multi_node/amd_utils/server.sh |   6 +-
 2 files changed, 140 insertions(+), 21 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 924975932..064312b7d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=1"
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1227,36 +1227,94 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
     osl: 1024
     search-space:
     # non-MTP configurations
-    # 1*DEP4+ 1*DEP8
+    # 1*DEP8+ 2*DEP8
     - spec-decoding: "none"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 512, 1024, 2048]
       prefill:
         num-worker: 1
-        tp: 4
-        ep: 4
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1*DEP8+ 3*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048]
+      prefill:
         num-worker: 1
         tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=1"
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 3
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=3"
         - "DECODE_MTP_SIZE=0"
 
+
   - isl: 8192
     osl: 1024
     search-space:
     # non-MTP configurations
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
+      conc-list: [ 512, 1024, 2048]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 3*DEP8 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048]
+      prefill:
+        num-worker: 3
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=3"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 4*DEP8 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048]
       prefill:
         num-worker: 4
-        tp: 4
-        ep: 4
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=4"
@@ -1271,7 +1329,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327-3
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1284,23 +1342,42 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
     osl: 1024
     search-space:
     # MTP configurations
-    # 1*DEP4+ 1*DEP8
+    # 1*DEP8+ 2*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 512, 1024, 2048]
       prefill:
         num-worker: 1
-        tp: 4
-        ep: 4
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
       decode:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=2"
+        - "DECODE_MTP_SIZE=1"
+
+    # 1*DEP8+ 3*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048]
+      prefill:
         num-worker: 1
         tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=1"
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 3
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=3"
         - "DECODE_MTP_SIZE=1"
 
 
@@ -1308,13 +1385,51 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
     osl: 1024
     search-space:
     # MTP configurations
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
+      conc-list: [ 512, 1024, 2048]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+    # 3*DEP8 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048]
+      prefill:
+        num-worker: 3
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=3"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+    # 4*DEP8 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048]
       prefill:
         num-worker: 4
-        tp: 4
-        ep: 4
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=4"
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 141dc0d7d..e8d1f09f9 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -193,7 +193,11 @@ if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
 
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8 --stream-interval 3"
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8"
+if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
+    DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS  --stream-interval 3"
+fi
+
 if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
     DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
 fi

From 4d733e783d2a4dc91b0f506058982c6f2462b90a Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 13 Apr 2026 05:34:57 +0000
Subject: [PATCH 21/55] add new exp config

---
 .github/configs/amd-master.yaml           | 71 ++++++++++++++++++++++-
 benchmarks/multi_node/amd_utils/server.sh |  4 +-
 2 files changed, 71 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 064312b7d..f76176406 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1213,7 +1213,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
-dsr1-fp4-mi355x-sglang-disagg-exp:
+dsr1-fp4-mi355x-sglang-disagg-exp-april12:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
@@ -1328,7 +1328,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         - "DECODE_MTP_SIZE=0"
 
 
-dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
+dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april12:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
@@ -1433,6 +1433,73 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+dsr1-fp4-mi355x-sglang-disagg-exp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
+dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
       decode:
         num-worker: 1
         tp: 8
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index e8d1f09f9..025b943cc 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -188,12 +188,12 @@ else
 fi
 
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 8 "
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
 
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 8"
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32"
 if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
     DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS  --stream-interval 3"
 fi

From 83af74381c01a3282b99415391bab27390a966d8 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 13 Apr 2026 08:27:15 +0000
Subject: [PATCH 22/55] enable log level info

---
 .github/configs/amd-master.yaml             | 107 ++++++++++++++++++++
 benchmarks/multi_node/amd_utils/models.yaml |  14 +--
 benchmarks/multi_node/amd_utils/server.sh   |   9 +-
 3 files changed, 117 insertions(+), 13 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f76176406..401f2ed3e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1442,6 +1442,75 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april12:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+dsr1-fp4-mi355x-sglang-disagg-exp-april13:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
+dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april13:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+
+
 dsr1-fp4-mi355x-sglang-disagg-exp:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -1475,6 +1544,25 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
+    # 2*DEP8 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
@@ -1500,6 +1588,25 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+    # 2*DEP8 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
       decode:
         num-worker: 1
         tp: 8
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 3e8af0266..36c1ea707 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -38,7 +38,7 @@
 #         cuda_graph_bs_range: str
 
 DeepSeek-V3:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -69,7 +69,7 @@ DeepSeek-V3:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-V3-0324:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -100,7 +100,7 @@ DeepSeek-V3-0324:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -131,7 +131,7 @@ DeepSeek-R1:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -162,7 +162,7 @@ DeepSeek-R1-0528:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-Preview:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-v2:
-  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 025b943cc..9e3714c6f 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -368,8 +368,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} \
-        --log-level-http warning"
+        ${PREFILL_SERVER_CONFIG} "
 
     if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
         PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
@@ -498,8 +497,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
-        ${PREFILL_SERVER_CONFIG} \
-        --log-level-http warning"
+        ${PREFILL_SERVER_CONFIG} "
 
     if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
         rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
@@ -561,8 +559,7 @@ else
         --host 0.0.0.0 \
         --port 8000 \
         --trust-remote-code \
-        ${DECODE_SERVER_CONFIG} \
-        --log-level-http warning"
+        ${DECODE_SERVER_CONFIG} "
 
     if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
         rank=$((RANK % DECODE_NODES_PER_WORKER))

From 0c3083e29d7f742ae09845aa39981fde029df954 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 13 Apr 2026 09:50:26 +0000
Subject: [PATCH 23/55] fix mori env

---
 benchmarks/multi_node/amd_utils/env.sh | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index ee9cd0087..c84af0055 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -38,19 +38,15 @@ export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head
 export NCCL_IB_HCA=$IBDEVICES
 
 export SGLANG_USE_AITER=1
-export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
-export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200
 
-# Disable allocating memory in one pass
-export MORI_SHMEM_MODE=ISOLATION
-export SGLANG_MORI_FP8_DISP=True
+export SGLANG_MORI_DISPATCH_DTYPE=auto
+export SGLANG_MORI_FP8_COMB=true
 
-if [[ "$MODEL_NAME" == *mxfp4* ]]; then
-export SGLANG_MORI_FP8_DISP=False
-fi
+export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
+export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
 
-export SGLANG_MORI_FP4_DISP=False
-export SGLANG_MORI_FP8_COMB=False
+# Disable allocating memory in one pass
+export MORI_SHMEM_MODE=ISOLATION
 
 # Enable spec v2 
 export SGLANG_ENABLE_SPEC_V2=1
@@ -58,12 +54,11 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 
 export SGLANG_LOG_MS=true
 export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
-export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
-export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
+
 
 # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower)
 export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
-if [[ "$MODEL_NAME" == *mxfp4* ]]; then
+if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then
     export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
 fi
 export MORI_MAX_DISPATCH_TOKENS_DECODE=512
@@ -73,11 +68,6 @@ export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_T
 
 export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
 
-#TODO(billishyahao): The following IO env will be deprecated soon.
-export MORI_IO_QP_MAX_SEND_WR=16384
-export MORI_IO_QP_MAX_CQE=32768
-export MORI_IO_QP_MAX_SGE=4
-
 
 export MORI_APP_LOG_LEVEL=INFO
 

From 1c61622541b61707961ef478e031a66d680ef9c2 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 13 Apr 2026 12:36:08 +0000
Subject: [PATCH 24/55] bump image

---
 .github/configs/amd-master.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 401f2ed3e..239fc399e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1512,7 +1512,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april13:
 
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1565,7 +1565,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg

From e2d2ac99cef73197204f55351eb45016505f4cdf Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 13 Apr 2026 12:57:07 +0000
Subject: [PATCH 25/55] fix log

---
 benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 36c1ea707..06ede97dd 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -38,7 +38,7 @@
 #         cuda_graph_bs_range: str
 
 DeepSeek-V3:
-  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -69,7 +69,7 @@ DeepSeek-V3:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-V3-0324:
-  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -100,7 +100,7 @@ DeepSeek-V3-0324:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1:
-  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -131,7 +131,7 @@ DeepSeek-R1:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528:
-  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -162,7 +162,7 @@ DeepSeek-R1-0528:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-Preview:
-  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4:
-  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-v2:
-  base_flags: "--decode-log-interval 1 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:

From d2a7988e47c7ab71febe9f09a3860262234db662 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 14 Apr 2026 13:58:15 +0000
Subject: [PATCH 26/55] use 0411 image for fp4 disagg, drop exp configs, retune mori env

---
 .github/configs/amd-master.yaml             | 406 +-------------------
 benchmarks/multi_node/amd_utils/env.sh      |  15 +-
 benchmarks/multi_node/amd_utils/models.yaml |   6 +-
 benchmarks/multi_node/amd_utils/server.sh   |  11 +-
 4 files changed, 20 insertions(+), 418 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 239fc399e..44c8df96d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1213,405 +1213,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
-dsr1-fp4-mi355x-sglang-disagg-exp-april12:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 1*DEP8+ 2*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=0"
-
-    # 1*DEP8+ 3*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 3
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=3"
-        - "DECODE_MTP_SIZE=0"
-
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 3*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 3
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=3"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 4*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 4
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-
-dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april12:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 1*DEP8+ 2*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=2"
-        - "DECODE_MTP_SIZE=1"
-
-    # 1*DEP8+ 3*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 3
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=3"
-        - "DECODE_MTP_SIZE=1"
-
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # 3*DEP8 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 3
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=3"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # 4*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 4
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-dsr1-fp4-mi355x-sglang-disagg-exp-april13:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-
-dsr1-fp4-mi355x-sglang-disagg-mtp-exp-april13:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-
-
-dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-
-dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0413
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index c84af0055..a4fe1d72b 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -41,6 +41,12 @@ export SGLANG_USE_AITER=1
 
 export SGLANG_MORI_DISPATCH_DTYPE=auto
 export SGLANG_MORI_FP8_COMB=true
+export SGLANG_MORI_QP_PER_TRANSFER=2
+export SGLANG_MORI_NUM_WORKERS=2
+
+export MORI_IO_QP_MAX_SEND_WR=16384
+export MORI_IO_QP_MAX_CQE=32768 
+export MORI_IO_QP_MAX_SGE=4
 
 export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
 export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
@@ -57,10 +63,11 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 
 
 # Per-role dispatch token limits (prefill uses higher throughput, decode uses lower)
-export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
-if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then
-    export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
-fi
+# export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
+# if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then
+#     export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
+# fi
+export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048
 export MORI_MAX_DISPATCH_TOKENS_DECODE=512
 
 # set MTP size=1 when EP16
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 06ede97dd..f5faf5935 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4:
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
-  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 3 --tokenizer-worker-num 32 "
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -242,9 +242,9 @@ DeepSeek-R1-0528-MXFP4-v2:
     mem_fraction_static: 0.85
     prefill_round_robin_balance: true
     dp:
-      max_running_requests: 4096
+      max_running_requests: 2048
       chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
-      cuda_graph_bs_range: "1-512"
+      cuda_graph_bs_range: "1-256"
     ep_only:
       max_running_requests: 256
       chunked_prefill_size: 262144
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 9e3714c6f..3c224a872 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -188,15 +188,12 @@ else
 fi
 
 # Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
-PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} --tokenizer-worker-num 32 "
+PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
 
-DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} --tokenizer-worker-num 32"
-if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
-    DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS  --stream-interval 3"
-fi
+DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
 
 if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
     DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
@@ -356,8 +353,8 @@ if [ "$NODE_RANK" -eq 0 ]; then
     echo "Decode  parallelism: TP=${DECODE_TP_SIZE},  EP enabled: ${DECODE_ENABLE_EP},  DP enabled: ${DECODE_ENABLE_DP},  MTP size=${DECODE_MTP_SIZE}"
     echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}"
     echo "Decode servers  ($((DECODE_TP_SIZE/GPUS_PER_NODE))  nodes): ${DECODE_ARGS}"
-    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
-    echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}"
+    echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
+    echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
     echo "================================================"
 
     # start the head prefill server

From b09ae6cb96952c1dd3c1b166b301152881cd14ce Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 14 Apr 2026 14:09:55 +0000
Subject: [PATCH 27/55] add --ep-dispatch-algorithm fake to model base_flags

---
 benchmarks/multi_node/amd_utils/models.yaml | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index f5faf5935..317352365 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -38,7 +38,7 @@
 #         cuda_graph_bs_range: str
 
 DeepSeek-V3:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -69,7 +69,7 @@ DeepSeek-V3:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-V3-0324:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -100,7 +100,7 @@ DeepSeek-V3-0324:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -131,7 +131,7 @@ DeepSeek-R1:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -162,7 +162,7 @@ DeepSeek-R1-0528:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-Preview:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-v2:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600  --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 3 --tokenizer-worker-num 32 "
   prefill:

From 2c3ee04fe5913dfd72eef3ec40740bc315682c7d Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 14 Apr 2026 14:27:19 +0000
Subject: [PATCH 28/55] lower MORI_MAX_DISPATCH_TOKENS_DECODE from 512 to 256

---
 benchmarks/multi_node/amd_utils/env.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index a4fe1d72b..a944a3d00 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -68,7 +68,7 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 #     export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
 # fi
 export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048
-export MORI_MAX_DISPATCH_TOKENS_DECODE=512
+export MORI_MAX_DISPATCH_TOKENS_DECODE=256
 
 # set MTP size=1 when EP16
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))

From 69102f7a4838303253cc705b3f4021a26f6ecc09 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Wed, 15 Apr 2026 15:28:42 +0000
Subject: [PATCH 29/55] bump image to 0415; pass SGLANG_MORI_MOE_MAX_INPUT_TOKENS to decode

---
 .github/configs/amd-master.yaml           | 4 ++--
 benchmarks/multi_node/amd_utils/server.sh | 4 +++-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 44c8df96d..ece23090d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 3c224a872..18b0bc7ea 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -355,6 +355,8 @@ if [ "$NODE_RANK" -eq 0 ]; then
     echo "Decode servers  ($((DECODE_TP_SIZE/GPUS_PER_NODE))  nodes): ${DECODE_ARGS}"
     echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
     echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
+    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} "
+
     echo "================================================"
 
     # start the head prefill server
@@ -549,7 +551,7 @@ else
     echo "Decode node rank: $RANK"
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
 
-    DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \

From 668068c8c8669de8e99466af2bec944f9d89355a Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 16 Apr 2026 05:22:08 +0000
Subject: [PATCH 30/55] export SGLANG_MORI_MOE_MAX_INPUT_TOKENS in env.sh instead of DECODE_CMD

---
 benchmarks/multi_node/amd_utils/env.sh    | 1 +
 benchmarks/multi_node/amd_utils/server.sh | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index a944a3d00..5071ec62d 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -69,6 +69,7 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 # fi
 export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048
 export MORI_MAX_DISPATCH_TOKENS_DECODE=256
+export SGLANG_MORI_MOE_MAX_INPUT_TOKENS=2048
 
 # set MTP size=1 when EP16
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 18b0bc7ea..18518f4d2 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -551,7 +551,7 @@ else
     echo "Decode node rank: $RANK"
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
 
-    DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \

From 776fd425871777ad4cfb180896aa88ddb290e0d1 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 16 Apr 2026 15:28:49 +0000
Subject: [PATCH 31/55] bump image to 0416

---
 .github/configs/amd-master.yaml           | 4 ++--
 benchmarks/multi_node/amd_utils/server.sh | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ece23090d..f5705293b 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0415
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 18518f4d2..4e1164b24 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -201,6 +201,7 @@ fi
 
 if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
     MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
+    SGLANG_MORI_MOE_MAX_INPUT_TOKENS=$((SGLANG_MORI_MOE_MAX_INPUT_TOKENS * (DECODE_MTP_SIZE + 1)))
 fi
 
 # =============================================================================
@@ -355,7 +356,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
     echo "Decode servers  ($((DECODE_TP_SIZE/GPUS_PER_NODE))  nodes): ${DECODE_ARGS}"
     echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
     echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
-    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${DECODE_CUDA_GRAPH_BS_DP_END} "
+    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} "
 
     echo "================================================"
 
@@ -551,7 +552,7 @@ else
     echo "Decode node rank: $RANK"
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
 
-    DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \

From 2471379ae51e9d85370ace8c3717e0104d589232 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 17 Apr 2026 06:29:55 +0000
Subject: [PATCH 32/55] add fp4 exp configs; rename decode MoE max-input-tokens var

---
 .github/configs/amd-master.yaml           | 114 ++++++++++++++++++++++
 benchmarks/multi_node/amd_utils/env.sh    |   2 +-
 benchmarks/multi_node/amd_utils/server.sh |   6 +-
 3 files changed, 118 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index f5705293b..ad4b3f559 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1213,3 +1213,117 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+dsr1-fp4-mi355x-sglang-disagg-exp:
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
+dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 5071ec62d..36361bb7b 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -69,7 +69,7 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 # fi
 export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048
 export MORI_MAX_DISPATCH_TOKENS_DECODE=256
-export SGLANG_MORI_MOE_MAX_INPUT_TOKENS=2048
+export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048
 
 # set MTP size=1 when EP16
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 4e1164b24..c7ab4d4ac 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -201,7 +201,7 @@ fi
 
 if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
     MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
-    SGLANG_MORI_MOE_MAX_INPUT_TOKENS=$((SGLANG_MORI_MOE_MAX_INPUT_TOKENS * (DECODE_MTP_SIZE + 1)))
+    MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
 fi
 
 # =============================================================================
@@ -356,7 +356,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
     echo "Decode servers  ($((DECODE_TP_SIZE/GPUS_PER_NODE))  nodes): ${DECODE_ARGS}"
     echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
     echo "Decode  env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
-    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} "
+    echo "Decode  env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "
 
     echo "================================================"
 
@@ -552,7 +552,7 @@ else
     echo "Decode node rank: $RANK"
     echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"
 
-    DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${SGLANG_MORI_MOE_MAX_INPUT_TOKENS} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
+    DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
         --model-path ${MODEL_DIR}/${MODEL_NAME} \
         --disaggregation-mode decode \
         --disaggregation-ib-device ${IBDEVICES} \

From c80997fddcd5f3c8a23c37d477f7bb6caf231277 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 17 Apr 2026 10:31:52 +0000
Subject: [PATCH 33/55] set --stream-interval to 100 for DeepSeek-R1-0528-MXFP4-v2

---
 benchmarks/multi_node/amd_utils/models.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 317352365..eefc93920 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4:
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
-  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 3 --tokenizer-worker-num 32 "
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 "
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true

From 616c57deaa094dafa358931daea2b9c703e97cf7 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 18 Apr 2026 08:08:03 +0000
Subject: [PATCH 34/55] bump fp4 disagg images to mori-0417

---
 .github/configs/amd-master.yaml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ad4b3f559..e3995511e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=1"
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0416
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg

From 3d62e2c6d8ce0066abeb6eb2426f0150ea000f82 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sun, 19 Apr 2026 15:06:50 +0000
Subject: [PATCH 35/55] revert exp configs to the 0408-high-concurrency image

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e3995511e..8f1fb5efa 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=1"
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg

From 2c4c09d97ac0efd2a3dd1255f9a063923e2faed4 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sun, 19 Apr 2026 15:09:23 +0000
Subject: [PATCH 36/55] switch exp configs to the sglang-0.5.9 mori-0411 image

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 8f1fb5efa..d844c8ecf 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=1"
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0408-high-concurrency
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg

From 1c9b8d2a95259b4325e579cb10f40b84a78f05e5 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 20 Apr 2026 06:14:43 +0000
Subject: [PATCH 37/55] increase MORI_MAX_DISPATCH_TOKENS_PREFILL to 5120

---
 benchmarks/multi_node/amd_utils/env.sh | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 36361bb7b..c751078ec 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -67,7 +67,8 @@ export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 # if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then
 #     export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
 # fi
-export MORI_MAX_DISPATCH_TOKENS_PREFILL=2048
+
+export MORI_MAX_DISPATCH_TOKENS_PREFILL=5120
 export MORI_MAX_DISPATCH_TOKENS_DECODE=256
 export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048
 

From 8e6104eb526bf5ddd6023e3f3616e761a52ef7a1 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Mon, 20 Apr 2026 06:17:56 +0000
Subject: [PATCH 38/55] bump exp config images back to mori-0417

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index d844c8ecf..e3995511e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=1"
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1271,7 +1271,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg

From 7cc5d81728877f4f2067e85fd6de18a7039c6a3a Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 21 Apr 2026 15:06:04 +0000
Subject: [PATCH 39/55] exp configs: sweep 8k/1k prefill layouts on mori-0411 image

---
 .github/configs/amd-master.yaml | 116 +++++++++++++++++++++++++-------
 1 file changed, 91 insertions(+), 25 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index e3995511e..de6ee6239 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1214,7 +1214,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=1"
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1223,20 +1223,39 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
   multinode: true
   disagg: true
   seq-len-configs:
-  - isl: 1024
+  - isl: 8192
     osl: 1024
     search-space:
     # non-MTP configurations
-    # 1*DEP4+ 1*DEP8
+    # 4*DEP4 + 1*DEP8
     - spec-decoding: "none"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 512, 1024, 2048 ]
       prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
         num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 3*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 3
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
+        - "PREFILL_NODES=3"
       decode:
         num-worker: 1
         tp: 8
@@ -1246,20 +1265,35 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP4 + 1*DEP8
     - spec-decoding: "none"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 4
+        num-worker: 2
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=4"
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 1*DEP8 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
       decode:
         num-worker: 1
         tp: 8
@@ -1271,7 +1305,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1280,20 +1314,20 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
   multinode: true
   disagg: true
   seq-len-configs:
-  - isl: 1024
+  - isl: 8192
     osl: 1024
     search-space:
     # MTP configurations
-    # 1*DEP4+ 1*DEP8
+    # 4*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 1
+        num-worker: 4
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
+        - "PREFILL_NODES=4"
       decode:
         num-worker: 1
         tp: 8
@@ -1303,21 +1337,35 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+    # 3*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 3
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=3"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
 
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 4
+        num-worker: 2
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=4"
+        - "PREFILL_NODES=2"
       decode:
         num-worker: 1
         tp: 8
@@ -1327,3 +1375,21 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+    # 1*DEP8 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
\ No newline at end of file

From a1c05da8c0db2f43e9c346c22b4eaf608694da84 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Wed, 22 Apr 2026 11:06:42 +0000
Subject: [PATCH 40/55] exp configs: keep 2*DEP8 and 1*DEP4; add --num-continuous-decode-steps=4

---
 .github/configs/amd-master.yaml             | 96 +++------------------
 benchmarks/multi_node/amd_utils/models.yaml |  2 +-
 2 files changed, 11 insertions(+), 87 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index de6ee6239..31bdcc6a5 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1227,53 +1227,15 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
     osl: 1024
     search-space:
     # non-MTP configurations
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "none"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 3*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 3
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=3"
-      decode:
-        num-worker: 1
+        num-worker: 2
         tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 2*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 2
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
         - "PREFILL_NODES=2"
       decode:
         num-worker: 1
@@ -1284,13 +1246,13 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-    # 1*DEP8 + 1*DEP8
+    # 1*DEP4 + 1*DEP8
     - spec-decoding: "none"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
         num-worker: 1
-        tp: 8
-        ep: 8
+        tp: 4
+        ep: 4
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
@@ -1318,53 +1280,15 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
     osl: 1024
     search-space:
     # MTP configurations
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "mtp"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # 3*DEP4 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 3
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=3"
-      decode:
-        num-worker: 1
+        num-worker: 2
         tp: 8
         ep: 8
         dp-attn: true
         additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # 2*DEP4 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 2
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
         - "PREFILL_NODES=2"
       decode:
         num-worker: 1
@@ -1375,13 +1299,13 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
-    # 1*DEP8 + 1*DEP8
+    # 1*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
         num-worker: 1
-        tp: 8
-        ep: 8
+        tp: 4
+        ep: 4
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=1"
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index eefc93920..b2b013244 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4:
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
-  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 "
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4 "
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true

From a915729352b1a1ead26d1b594fd362d89e09d0e3 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 23 Apr 2026 05:37:26 +0000
Subject: [PATCH 41/55] exp configs: switch prefill to 4*DEP4 and 3*DEP4

---
 .github/configs/amd-master.yaml | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 31bdcc6a5..355fa141e 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1227,16 +1227,16 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
     osl: 1024
     search-space:
     # non-MTP configurations
-    # 2*DEP8 + 1*DEP8
+    # 4*DEP4 + 1*DEP8
     - spec-decoding: "none"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
+        num-worker: 4
+        tp: 4
+        ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=2"
+        - "PREFILL_NODES=4"
       decode:
         num-worker: 1
         tp: 8
@@ -1246,16 +1246,16 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-    # 1*DEP4 + 1*DEP8
+    # 3*DEP4 + 1*DEP8
     - spec-decoding: "none"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 1
+        num-worker: 3
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
+        - "PREFILL_NODES=3"
       decode:
         num-worker: 1
         tp: 8
@@ -1280,16 +1280,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
     osl: 1024
     search-space:
     # MTP configurations
-    # 2*DEP8 + 1*DEP8
+    # 4*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
+        num-worker: 4
+        tp: 4
+        ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=2"
+        - "PREFILL_NODES=4"
       decode:
         num-worker: 1
         tp: 8
@@ -1299,16 +1299,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
-    # 1*DEP4 + 1*DEP8
+    # 3*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 1
+        num-worker: 3
         tp: 4
         ep: 4
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=1"
+        - "PREFILL_NODES=3"
       decode:
         num-worker: 1
         tp: 8

From 44d10a1adf7ebb5fb2114d57b9bd72140f028baf Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 23 Apr 2026 15:44:35 +0000
Subject: [PATCH 42/55] exp configs: use 2*DEP8 prefill; add --scheduler-recv-interval=4

---
 .github/configs/amd-master.yaml             | 20 ++++++++++----------
 benchmarks/multi_node/amd_utils/models.yaml |  2 +-
 2 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 355fa141e..2550f9b70 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1246,16 +1246,16 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=0"
 
-    # 3*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "none"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 3
-        tp: 4
-        ep: 4
+        num-worker: 2
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=3"
+        - "PREFILL_NODES=2"
       decode:
         num-worker: 1
         tp: 8
@@ -1299,16 +1299,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
-    # 3*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "mtp"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 3
-        tp: 4
-        ep: 4
+        num-worker: 2
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=3"
+        - "PREFILL_NODES=2"
       decode:
         num-worker: 1
         tp: 8
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index b2b013244..c547b1174 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4:
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
-  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4 "
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4  --scheduler-recv-interval=4 "
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true

From f09820e7c5bb46805743ff5bf1fecd6d116aea6b Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 24 Apr 2026 02:16:20 +0000
Subject: [PATCH 43/55] add exp configs

---
 .github/configs/amd-master.yaml             | 135 +++++++++++++++++++-
 benchmarks/multi_node/amd_utils/models.yaml |   2 +-
 2 files changed, 129 insertions(+), 8 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 2550f9b70..8b9d7a594 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1266,6 +1266,112 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         - "DECODE_MTP_SIZE=0"
 
 
+# dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
+#   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+#   model: amd/DeepSeek-R1-0528-MXFP4-v2
+#   model-prefix: dsr1
+#   runner: mi355x-disagg
+#   precision: fp4
+#   framework: sglang-disagg
+#   multinode: true
+#   disagg: true
+#   seq-len-configs:
+#   - isl: 8192
+#     osl: 1024
+#     search-space:
+#     # MTP configurations
+#     # 4*DEP4 + 1*DEP8
+#     - spec-decoding: "mtp"
+#       conc-list: [ 512, 1024, 2048 ]
+#       prefill:
+#         num-worker: 4
+#         tp: 4
+#         ep: 4
+#         dp-attn: true
+#         additional-settings:
+#         - "PREFILL_NODES=4"
+#       decode:
+#         num-worker: 1
+#         tp: 8
+#         ep: 8
+#         dp-attn: true
+#         additional-settings:
+#         - "DECODE_NODES=1"
+#         - "DECODE_MTP_SIZE=1"
+
+#     # 2*DEP8 + 1*DEP8
+#     - spec-decoding: "mtp"
+#       conc-list: [ 512, 1024, 2048 ]
+#       prefill:
+#         num-worker: 2
+#         tp: 8
+#         ep: 8
+#         dp-attn: true
+#         additional-settings:
+#         - "PREFILL_NODES=2"
+#       decode:
+#         num-worker: 1
+#         tp: 8
+#         ep: 8
+#         dp-attn: true
+#         additional-settings:
+#         - "DECODE_NODES=1"
+#         - "DECODE_MTP_SIZE=1"
+
+
+# dsr1-fp4-mi355x-sglang-disagg-exp:
+#   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+#   model: amd/DeepSeek-R1-0528-MXFP4-v2
+#   model-prefix: dsr1
+#   runner: mi355x-disagg
+#   precision: fp4
+#   framework: sglang-disagg
+#   multinode: true
+#   disagg: true
+#   seq-len-configs:
+#   - isl: 8192
+#     osl: 1024
+#     search-space:
+#     # non-MTP configurations
+#     # 4*DEP4 + 1*DEP8
+#     - spec-decoding: "none"
+#       conc-list: [ 512, 1024, 2048 ]
+#       prefill:
+#         num-worker: 4
+#         tp: 4
+#         ep: 4
+#         dp-attn: true
+#         additional-settings:
+#         - "PREFILL_NODES=4"
+#       decode:
+#         num-worker: 1
+#         tp: 8
+#         ep: 8
+#         dp-attn: true
+#         additional-settings:
+#         - "DECODE_NODES=1"
+#         - "DECODE_MTP_SIZE=0"
+
+#     # 2*DEP8 + 1*DEP8
+#     - spec-decoding: "none"
+#       conc-list: [ 512, 1024, 2048 ]
+#       prefill:
+#         num-worker: 2
+#         tp: 8
+#         ep: 8
+#         dp-attn: true
+#         additional-settings:
+#         - "PREFILL_NODES=2"
+#       decode:
+#         num-worker: 1
+#         tp: 8
+#         ep: 8
+#         dp-attn: true
+#         additional-settings:
+#         - "DECODE_NODES=1"
+#         - "DECODE_MTP_SIZE=0"
+
+
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
   model: amd/DeepSeek-R1-0528-MXFP4-v2
@@ -1280,16 +1386,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
     osl: 1024
     search-space:
     # MTP configurations
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "mtp"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
+        num-worker: 2
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=4"
+        - "PREFILL_NODES=2"
       decode:
         num-worker: 1
         tp: 8
@@ -1299,8 +1405,23 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+
+dsr1-fp4-mi355x-sglang-disagg-exp:
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
     # 2*DEP8 + 1*DEP8
-    - spec-decoding: "mtp"
+    - spec-decoding: "none"
       conc-list: [ 512, 1024, 2048 ]
       prefill:
         num-worker: 2
@@ -1316,4 +1437,4 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         dp-attn: true
         additional-settings:
         - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
\ No newline at end of file
+        - "DECODE_MTP_SIZE=0"
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index c547b1174..eefc93920 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -226,7 +226,7 @@ DeepSeek-R1-0528-MXFP4:
 DeepSeek-R1-0528-MXFP4-v2:
   base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
-  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 --num-continuous-decode-steps=4  --scheduler-recv-interval=4 "
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 "
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true

From 5144ca12507ebc6aceddde3afe48cdfb1d2953f3 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 24 Apr 2026 06:58:53 +0000
Subject: [PATCH 44/55] add exp configs

---
 .github/configs/amd-master.yaml | 42 +++++++++++++++++++++++++++++++--
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 8b9d7a594..ae40a0633 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1373,7 +1373,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1405,9 +1405,28 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
 
 dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1430,6 +1449,25 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
         dp-attn: true
         additional-settings:
         - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+    # 4*DEP4 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 512, 1024, 2048 ]
+      prefill:
+        num-worker: 4
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=4"
       decode:
         num-worker: 1
         tp: 8

From d9e2eefa0a6d17d9193654e8289a8552bd985a29 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Tue, 28 Apr 2026 16:30:00 +0000
Subject: [PATCH 45/55] bump image

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ae40a0633..ed165452d 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0417
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg

From ee33925882723f97bafea7ebadda6564e8cd36b5 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 30 Apr 2026 10:43:33 +0000
Subject: [PATCH 46/55] sync arguments

---
 .github/configs/amd-master.yaml             | 271 +-------------------
 benchmarks/multi_node/amd_utils/env.sh      |  12 +-
 benchmarks/multi_node/amd_utils/models.yaml |   4 +-
 3 files changed, 9 insertions(+), 278 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index ed165452d..887c81a58 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -886,7 +886,7 @@ dsr1-fp4-mi355x-sglang-disagg:
     
     # 1*DEP4+ 1*DEP8
     - spec-decoding: "none"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 1024, 2048, 4096 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -985,7 +985,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
     # 4*DEP4 + 1*DEP8
     - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
+      conc-list: [  1024, 2048 ]
       prefill:
         num-worker: 4
         tp: 4
@@ -1095,7 +1095,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
     # 1*DEP4+ 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 1024, 2048, 4096 ]
       prefill:
         num-worker: 1
         tp: 4
@@ -1196,218 +1196,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
 
     # 4*DEP4 + 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-
-# dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-#   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
-#   model: amd/DeepSeek-R1-0528-MXFP4-v2
-#   model-prefix: dsr1
-#   runner: mi355x-disagg
-#   precision: fp4
-#   framework: sglang-disagg
-#   multinode: true
-#   disagg: true
-#   seq-len-configs:
-#   - isl: 8192
-#     osl: 1024
-#     search-space:
-#     # MTP configurations
-#     # 4*DEP4 + 1*DEP8
-#     - spec-decoding: "mtp"
-#       conc-list: [ 512, 1024, 2048 ]
-#       prefill:
-#         num-worker: 4
-#         tp: 4
-#         ep: 4
-#         dp-attn: true
-#         additional-settings:
-#         - "PREFILL_NODES=4"
-#       decode:
-#         num-worker: 1
-#         tp: 8
-#         ep: 8
-#         dp-attn: true
-#         additional-settings:
-#         - "DECODE_NODES=1"
-#         - "DECODE_MTP_SIZE=1"
-
-#     # 2*DEP8 + 1*DEP8
-#     - spec-decoding: "mtp"
-#       conc-list: [ 512, 1024, 2048 ]
-#       prefill:
-#         num-worker: 2
-#         tp: 8
-#         ep: 8
-#         dp-attn: true
-#         additional-settings:
-#         - "PREFILL_NODES=2"
-#       decode:
-#         num-worker: 1
-#         tp: 8
-#         ep: 8
-#         dp-attn: true
-#         additional-settings:
-#         - "DECODE_NODES=1"
-#         - "DECODE_MTP_SIZE=1"
-
-
-# dsr1-fp4-mi355x-sglang-disagg-exp:
-#   image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0411
-#   model: amd/DeepSeek-R1-0528-MXFP4-v2
-#   model-prefix: dsr1
-#   runner: mi355x-disagg
-#   precision: fp4
-#   framework: sglang-disagg
-#   multinode: true
-#   disagg: true
-#   seq-len-configs:
-#   - isl: 8192
-#     osl: 1024
-#     search-space:
-#     # non-MTP configurations
-#     # 4*DEP4 + 1*DEP8
-#     - spec-decoding: "none"
-#       conc-list: [ 512, 1024, 2048 ]
-#       prefill:
-#         num-worker: 4
-#         tp: 4
-#         ep: 4
-#         dp-attn: true
-#         additional-settings:
-#         - "PREFILL_NODES=4"
-#       decode:
-#         num-worker: 1
-#         tp: 8
-#         ep: 8
-#         dp-attn: true
-#         additional-settings:
-#         - "DECODE_NODES=1"
-#         - "DECODE_MTP_SIZE=0"
-
-#     # 2*DEP8 + 1*DEP8
-#     - spec-decoding: "none"
-#       conc-list: [ 512, 1024, 2048 ]
-#       prefill:
-#         num-worker: 2
-#         tp: 8
-#         ep: 8
-#         dp-attn: true
-#         additional-settings:
-#         - "PREFILL_NODES=2"
-#       decode:
-#         num-worker: 1
-#         tp: 8
-#         ep: 8
-#         dp-attn: true
-#         additional-settings:
-#         - "DECODE_NODES=1"
-#         - "DECODE_MTP_SIZE=0"
-
-
-dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
-
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048 ]
+      conc-list: [ 1024, 2048 ]
       prefill:
         num-worker: 4
         tp: 4
@@ -1424,55 +1213,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
-
-dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0424
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-    # 4*DEP4 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048 ]
-      prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=4"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index c751078ec..9cc96738b 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -41,8 +41,9 @@ export SGLANG_USE_AITER=1
 
 export SGLANG_MORI_DISPATCH_DTYPE=auto
 export SGLANG_MORI_FP8_COMB=true
-export SGLANG_MORI_QP_PER_TRANSFER=2
-export SGLANG_MORI_NUM_WORKERS=2
+export SGLANG_MORI_QP_PER_TRANSFER=4
+export SGLANG_MORI_NUM_WORKERS=4
+export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000
 
 export MORI_IO_QP_MAX_SEND_WR=16384
 export MORI_IO_QP_MAX_CQE=32768 
@@ -61,13 +62,6 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 export SGLANG_LOG_MS=true
 export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 
-
-# Per-role dispatch token limits (prefill uses higher throughput, decode uses lower)
-# export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
-# if [[ "$MODEL_NAME" == *mxfp4* || "$MODEL_NAME" == *MXFP4* ]]; then
-#     export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
-# fi
-
 export MORI_MAX_DISPATCH_TOKENS_PREFILL=5120
 export MORI_MAX_DISPATCH_TOKENS_DECODE=256
 export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index eefc93920..6ed51fc41 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -242,9 +242,9 @@ DeepSeek-R1-0528-MXFP4-v2:
     mem_fraction_static: 0.85
     prefill_round_robin_balance: true
     dp:
-      max_running_requests: 2048
+      max_running_requests: 4096
       chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
-      cuda_graph_bs_range: "1-256"
+      cuda_graph_bs_range: "1-512"
     ep_only:
       max_running_requests: 256
       chunked_prefill_size: 262144

From 2b1ff6b5d15e98103ff0e6ef272fad76b195c02c Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Thu, 30 Apr 2026 10:57:14 +0000
Subject: [PATCH 47/55] use mori-0227-2 image for dsr1-fp8 sglang-disagg configs

---
 .github/configs/amd-master.yaml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 887c81a58..cd11bb7d5 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -486,7 +486,7 @@ dsr1-fp8-mi355x-atom-mtp:
     - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp  }
 
 dsr1-fp8-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -641,7 +641,7 @@ dsr1-fp8-mi355x-sglang-disagg:
 
 
 dsr1-fp8-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0327
+  image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2
   model: deepseek-ai/DeepSeek-R1-0528
   model-prefix: dsr1
   runner: mi355x-disagg

From 05487731866203b099ce155831791cdd1084f330 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 1 May 2026 03:53:01 +0000
Subject: [PATCH 48/55] re-add fp4 exp configs and extend prefill DP settings

---
 .github/configs/amd-master.yaml             | 113 ++++++++++++++++++++
 benchmarks/multi_node/amd_utils/env.sh      |   8 +-
 benchmarks/multi_node/amd_utils/models.yaml |   5 +-
 benchmarks/multi_node/amd_utils/server.sh   |  22 +++-
 4 files changed, 142 insertions(+), 6 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index cd11bb7d5..cfa23a210 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1213,3 +1213,116 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
 
+dsr1-fp4-mi355x-sglang-disagg-exp:
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # non-MTP configurations
+    # 2*DEP8 + 1*DEP8
+    - spec-decoding: "none"
+      conc-list: [  1024, 2048 ]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=0"
+
+
+dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
+  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
+  model: amd/DeepSeek-R1-0528-MXFP4-v2
+  model-prefix: dsr1
+  runner: mi355x-disagg
+  precision: fp4
+  framework: sglang-disagg
+  multinode: true
+  disagg: true
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 1*DEP4+ 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048, 4096 ]
+      prefill:
+        num-worker: 1
+        tp: 4
+        ep: 4
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=1"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
+
+
+  - isl: 8192
+    osl: 1024
+    search-space:
+    # MTP configurations
+    # 2*DEP8 + 1*DEP8
+    - spec-decoding: "mtp"
+      conc-list: [ 1024, 2048 ]
+      prefill:
+        num-worker: 2
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "PREFILL_NODES=2"
+      decode:
+        num-worker: 1
+        tp: 8
+        ep: 8
+        dp-attn: true
+        additional-settings:
+        - "DECODE_NODES=1"
+        - "DECODE_MTP_SIZE=1"
\ No newline at end of file
diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 9cc96738b..472e9b0de 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -62,9 +62,11 @@ export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
 export SGLANG_LOG_MS=true
 export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32
 
-export MORI_MAX_DISPATCH_TOKENS_PREFILL=5120
-export MORI_MAX_DISPATCH_TOKENS_DECODE=256
-export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2048
+export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
+export MORI_MAX_DISPATCH_TOKENS_DECODE=512
+
+export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
+export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703
 
 # set MTP size=1 when EP16
 export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index 6ed51fc41..fbe60d0ec 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -231,9 +231,12 @@ DeepSeek-R1-0528-MXFP4-v2:
     mem_fraction_static: 0.8
     disable_radix_cache: true
     dp:
-      max_running_requests: 32
+      max_running_requests: 4096
       chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
       cuda_graph_bs: "1 2 3"
+      context_length: 9217
+      max_total_tokens: 131072
+      enable_two_batch_overlap: true
     no_dp:
       max_running_requests: 128
       chunked_prefill_size: 16384
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index c7ab4d4ac..e6c24909b 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -127,6 +127,9 @@ no_dp = prefill.get('no_dp', {})
 print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
 print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
 print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"')
+print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"')
+print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
+print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
 print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
 print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
 s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
@@ -169,10 +172,16 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then
     prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP)
     prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP
     prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP
+    prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP
+    prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP
+    prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP
 else
     prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
     prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
     prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
+    prefill_context_length=""
+    prefill_max_total_tokens=""
+    prefill_enable_two_batch_overlap="false"
 fi
 
 # Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
@@ -192,6 +201,15 @@ PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-r
 if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
 fi
+if [[ -n "$prefill_context_length" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}"
+fi
+if [[ -n "$prefill_max_total_tokens" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}"
+fi
+if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
+    PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
+fi
 
 DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
 
@@ -361,7 +379,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
     echo "================================================"
 
     # start the head prefill server
-    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/$MODEL_NAME \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -490,7 +508,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
     echo "Using prefill config: $PREFILL_SERVER_CONFIG"
     echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
 
-    PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/${MODEL_NAME} \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \

From 724bd61a824e7d69b2ffd42de3d3f7aef6d05ec4 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 1 May 2026 06:12:54 +0000
Subject: [PATCH 49/55] disable 1k1k exp sweeps, widen 8k1k exp conc-list

---
 .github/configs/amd-master.yaml | 92 ++++++++++++++++-----------------
 1 file changed, 46 insertions(+), 46 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index cfa23a210..a5688b92f 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1223,28 +1223,28 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
   multinode: true
   disagg: true
   seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
+  # - isl: 1024
+  #   osl: 1024
+  #   search-space:
+  #   # non-MTP configurations
+  #   # 1*DEP4+ 1*DEP8
+  #   - spec-decoding: "none"
+  #     conc-list: [ 1024, 2048, 4096 ]
+  #     prefill:
+  #       num-worker: 1
+  #       tp: 4
+  #       ep: 4
+  #       dp-attn: true
+  #       additional-settings:
+  #       - "PREFILL_NODES=1"
+  #     decode:
+  #       num-worker: 1
+  #       tp: 8
+  #       ep: 8
+  #       dp-attn: true
+  #       additional-settings:
+  #       - "DECODE_NODES=1"
+  #       - "DECODE_MTP_SIZE=0"
 
   - isl: 8192
     osl: 1024
@@ -1252,7 +1252,7 @@ dsr1-fp4-mi355x-sglang-disagg-exp:
     # non-MTP configurations
     # 2*DEP8 + 1*DEP8
     - spec-decoding: "none"
-      conc-list: [  1024, 2048 ]
+      conc-list: [ 512, 1024, 2048, 4096 ]
       prefill:
         num-worker: 2
         tp: 8
@@ -1280,28 +1280,28 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
   multinode: true
   disagg: true
   seq-len-configs:
-  - isl: 1024
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
+  # - isl: 1024
+  #   osl: 1024
+  #   search-space:
+  #   # MTP configurations
+  #   # 1*DEP4+ 1*DEP8
+  #   - spec-decoding: "mtp"
+  #     conc-list: [ 1024, 2048, 4096 ]
+  #     prefill:
+  #       num-worker: 1
+  #       tp: 4
+  #       ep: 4
+  #       dp-attn: true
+  #       additional-settings:
+  #       - "PREFILL_NODES=1"
+  #     decode:
+  #       num-worker: 1
+  #       tp: 8
+  #       ep: 8
+  #       dp-attn: true
+  #       additional-settings:
+  #       - "DECODE_NODES=1"
+  #       - "DECODE_MTP_SIZE=1"
 
 
   - isl: 8192
@@ -1310,7 +1310,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
     # MTP configurations
     # 2*DEP8 + 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 512, 1024, 2048, 4096 ]
       prefill:
         num-worker: 2
         tp: 8

From f8f0a3a0fd606b45a7a405535858a36e8b1fc2b2 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 1 May 2026 06:46:17 +0000
Subject: [PATCH 50/55] enable sdma

---
 benchmarks/multi_node/amd_utils/env.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index 472e9b0de..cb094d1e3 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -49,12 +49,17 @@ export MORI_IO_QP_MAX_SEND_WR=16384
 export MORI_IO_QP_MAX_CQE=32768 
 export MORI_IO_QP_MAX_SGE=4
 
+export MORI_IO_TC_DISABLE=0
+
 export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
 export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
 
 # Disable allocating memory in one pass
 export MORI_SHMEM_MODE=ISOLATION
 
+# Enable SDMA
+export MORI_ENABLE_SDMA=true
+
 # Enable spec v2 
 export SGLANG_ENABLE_SPEC_V2=1
 export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1

From feb6c7d327e8f55269cf2c1d71fec9b4b2dc2133 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 1 May 2026 08:19:39 +0000
Subject: [PATCH 51/55] enable SDMA only with prefill two-batch overlap

---
 benchmarks/multi_node/amd_utils/env.sh    | 3 ---
 benchmarks/multi_node/amd_utils/server.sh | 1 +
 2 files changed, 1 insertion(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/env.sh b/benchmarks/multi_node/amd_utils/env.sh
index cb094d1e3..d0b99eddc 100755
--- a/benchmarks/multi_node/amd_utils/env.sh
+++ b/benchmarks/multi_node/amd_utils/env.sh
@@ -57,9 +57,6 @@ export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600
 # Disable allocating memory in one pass
 export MORI_SHMEM_MODE=ISOLATION
 
-# Enable SDMA
-export MORI_ENABLE_SDMA=true
-
 # Enable spec v2 
 export SGLANG_ENABLE_SPEC_V2=1
 export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index e6c24909b..537c0812e 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -209,6 +209,7 @@ if [[ -n "$prefill_max_total_tokens" ]]; then
 fi
 if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
+    export MORI_ENABLE_SDMA=true
 fi
 
 DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "

From f501a3e110cb364d7ef4eea54e0870356e080af1 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 1 May 2026 08:33:16 +0000
Subject: [PATCH 52/55] pass MORI_ENABLE_SDMA via prefill command env

---
 benchmarks/multi_node/amd_utils/server.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 537c0812e..89d0f223a 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -209,7 +209,7 @@ if [[ -n "$prefill_max_total_tokens" ]]; then
 fi
 if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
     PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
-    export MORI_ENABLE_SDMA=true
+    PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true"
 fi
 
 DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "
@@ -380,7 +380,7 @@ if [ "$NODE_RANK" -eq 0 ]; then
     echo "================================================"
 
     # start the head prefill server
-    PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/$MODEL_NAME \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \
@@ -509,7 +509,7 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
     echo "Using prefill config: $PREFILL_SERVER_CONFIG"
     echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"
 
-    PREFILL_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
+    PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
         --model-path $MODEL_DIR/${MODEL_NAME} \
         --disaggregation-mode prefill \
         --disaggregation-ib-device ${IBDEVICES} \

From 217d89277427627d7c76cc5e9932174d7932d221 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Fri, 1 May 2026 14:23:54 +0000
Subject: [PATCH 53/55] cleanup

---
 .github/configs/amd-master.yaml             | 24 ++++++++++-----------
 benchmarks/multi_node/amd_utils/models.yaml | 14 ++++++------
 2 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index a5688b92f..b98d1bb97 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -983,16 +983,16 @@ dsr1-fp4-mi355x-sglang-disagg:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=0"
 
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "none"
-      conc-list: [  1024, 2048 ]
+      conc-list: [ 1024, 2048, 4096 ]
       prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
+        num-worker: 2
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=4"
+        - "PREFILL_NODES=2"
       decode:
         num-worker: 1
         tp: 8
@@ -1194,16 +1194,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         - "DECODE_NODES=2"
         - "DECODE_MTP_SIZE=1"
 
-    # 4*DEP4 + 1*DEP8
+    # 2*DEP8 + 1*DEP8
     - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048 ]
+      conc-list: [ 1024, 2048, 4096 ]
       prefill:
-        num-worker: 4
-        tp: 4
-        ep: 4
+        num-worker: 2
+        tp: 8
+        ep: 8
         dp-attn: true
         additional-settings:
-        - "PREFILL_NODES=4"
+        - "PREFILL_NODES=2"
       decode:
         num-worker: 1
         tp: 8
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
index fbe60d0ec..436c32d27 100644
--- a/benchmarks/multi_node/amd_utils/models.yaml
+++ b/benchmarks/multi_node/amd_utils/models.yaml
@@ -38,7 +38,7 @@
 #         cuda_graph_bs_range: str
 
 DeepSeek-V3:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -69,7 +69,7 @@ DeepSeek-V3:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-V3-0324:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -100,7 +100,7 @@ DeepSeek-V3-0324:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -131,7 +131,7 @@ DeepSeek-R1:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -162,7 +162,7 @@ DeepSeek-R1-0528:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-Preview:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -193,7 +193,7 @@ DeepSeek-R1-0528-MXFP4-Preview:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
@@ -224,7 +224,7 @@ DeepSeek-R1-0528-MXFP4:
       cuda_graph_bs_range: "1-128"
 
 DeepSeek-R1-0528-MXFP4-v2:
-  base_flags: "--decode-log-interval 100 --log-level info --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
   dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 "
   prefill:

From a5a822a122d466bda675ec6db1b10194b5731564 Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 2 May 2026 00:57:12 +0000
Subject: [PATCH 54/55] bump image

---
 .github/configs/amd-master.yaml | 118 +-------------------------------
 perf-changelog.yaml             |   9 +--
 2 files changed, 7 insertions(+), 120 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index b98d1bb97..866642fff 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -795,7 +795,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
         - "DECODE_MTP_SIZE=2"
 
 dsr1-fp4-mi355x-sglang-disagg:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
+  image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1004,7 +1004,7 @@ dsr1-fp4-mi355x-sglang-disagg:
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
+  image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
   model: amd/DeepSeek-R1-0528-MXFP4-v2
   model-prefix: dsr1
   runner: mi355x-disagg
@@ -1212,117 +1212,3 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
         additional-settings:
         - "DECODE_NODES=1"
         - "DECODE_MTP_SIZE=1"
-
-dsr1-fp4-mi355x-sglang-disagg-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  # - isl: 1024
-  #   osl: 1024
-  #   search-space:
-  #   # non-MTP configurations
-  #   # 1*DEP4+ 1*DEP8
-  #   - spec-decoding: "none"
-  #     conc-list: [ 1024, 2048, 4096 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 4
-  #       ep: 4
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=1"
-  #       - "DECODE_MTP_SIZE=0"
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # non-MTP configurations
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 512, 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
-
-
-dsr1-fp4-mi355x-sglang-disagg-mtp-exp:
-  image: rocm/sgl-dev:sglang-0.5.10-rocm720-mi35x-mori-0428
-  model: amd/DeepSeek-R1-0528-MXFP4-v2
-  model-prefix: dsr1
-  runner: mi355x-disagg
-  precision: fp4
-  framework: sglang-disagg
-  multinode: true
-  disagg: true
-  seq-len-configs:
-  # - isl: 1024
-  #   osl: 1024
-  #   search-space:
-  #   # MTP configurations
-  #   # 1*DEP4+ 1*DEP8
-  #   - spec-decoding: "mtp"
-  #     conc-list: [ 1024, 2048, 4096 ]
-  #     prefill:
-  #       num-worker: 1
-  #       tp: 4
-  #       ep: 4
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "PREFILL_NODES=1"
-  #     decode:
-  #       num-worker: 1
-  #       tp: 8
-  #       ep: 8
-  #       dp-attn: true
-  #       additional-settings:
-  #       - "DECODE_NODES=1"
-  #       - "DECODE_MTP_SIZE=1"
-
-
-  - isl: 8192
-    osl: 1024
-    search-space:
-    # MTP configurations
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 512, 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
\ No newline at end of file
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 1cd22211a..ffd2a64e1 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1218,8 +1218,9 @@
     - dsr1-fp4-mi355x-sglang-disagg
     - dsr1-fp4-mi355x-sglang-disagg-mtp
   description:
-    - "Bump SGL mori image to March 27"
-    - "Add more low latency sweep configs"
+    - "Bump SGL mori image to lmsysorg/sglang-rocm"
+    - "Add more high-throughput / low-latency sweep configs"
     - "Enable v2 mxfp4 DSR1 0528 model"
-    - "Enable fp4 disp feature on mori"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/983
+    - "Enable fp4 dispatch / fp8 combine features on Mori"
+    - "Enable Mori SDMA + two-batch overlap feature"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1236

From f570ea7ae03f6dc9fff46cf4d45741321de1588f Mon Sep 17 00:00:00 2001
From: billishyahao <bill.he@amd.com>
Date: Sat, 2 May 2026 01:10:57 +0000
Subject: [PATCH 55/55] fix yaml

---
 .github/configs/amd-master.yaml           | 144 +++++++++++-----------
 benchmarks/multi_node/amd_utils/server.sh |   5 +
 2 files changed, 77 insertions(+), 72 deletions(-)

diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml
index 727bfc0b9..b1a9b1227 100644
--- a/.github/configs/amd-master.yaml
+++ b/.github/configs/amd-master.yaml
@@ -1238,24 +1238,24 @@ dsr1-fp4-mi355x-sglang-disagg:
           - "DECODE_NODES=2"
           - "DECODE_MTP_SIZE=0"
     
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
+      # 1*DEP4 + 1*DEP8
+      - spec-decoding: "none"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
 
     - isl: 8192
       osl: 1024
@@ -1337,24 +1337,24 @@ dsr1-fp4-mi355x-sglang-disagg:
           - "DECODE_NODES=2"
           - "DECODE_MTP_SIZE=0"
 
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "none"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=0"
+      # 2*DEP8 + 1*DEP8
+      - spec-decoding: "none"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=0"
 
 
 dsr1-fp4-mi355x-sglang-disagg-mtp:
@@ -1448,24 +1448,24 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=2"
           - "DECODE_MTP_SIZE=1"
 
-    # 1*DEP4+ 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 1
-        tp: 4
-        ep: 4
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=1"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
+      # 1*DEP4 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 1
+          tp: 4
+          ep: 4
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=1"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
 
 
     - isl: 8192
@@ -1549,24 +1549,24 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
           - "DECODE_NODES=2"
           - "DECODE_MTP_SIZE=1"
 
-    # 2*DEP8 + 1*DEP8
-    - spec-decoding: "mtp"
-      conc-list: [ 1024, 2048, 4096 ]
-      prefill:
-        num-worker: 2
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "PREFILL_NODES=2"
-      decode:
-        num-worker: 1
-        tp: 8
-        ep: 8
-        dp-attn: true
-        additional-settings:
-        - "DECODE_NODES=1"
-        - "DECODE_MTP_SIZE=1"
+      # 2*DEP8 + 1*DEP8
+      - spec-decoding: "mtp"
+        conc-list: [ 1024, 2048, 4096 ]
+        prefill:
+          num-worker: 2
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "PREFILL_NODES=2"
+        decode:
+          num-worker: 1
+          tp: 8
+          ep: 8
+          dp-attn: true
+          additional-settings:
+          - "DECODE_NODES=1"
+          - "DECODE_MTP_SIZE=1"
       
 
 dsv4-fp8-mi355x-sglang:
diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
index 63c1d3c48..4da9b56eb 100755
--- a/benchmarks/multi_node/amd_utils/server.sh
+++ b/benchmarks/multi_node/amd_utils/server.sh
@@ -338,6 +338,11 @@ if [[ -n "$MODEL_NAME" ]]; then
     echo "Using model-specific configuration for: $MODEL_NAME"
 fi
 
+if [[ "${EVAL_ONLY:-false}" == "true" ]] || [[ "${RUN_EVAL:-false}" == "true" ]]; then
+    PREFILL_SERVER_CONFIG=$(echo "$PREFILL_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+    DECODE_SERVER_CONFIG=$(echo "$DECODE_SERVER_CONFIG" | sed 's/--ep-dispatch-algorithm fake//g')
+fi
+
 # =============================================================================
 # Container Synchronization
 # =============================================================================