SemiAnalysisAI · chunfangamd · Mar 3, 2026 · Feb 24, 2026 · Feb 24, 2026 · Feb 24, 2026
diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml
@@ -40,7 +40,7 @@
 DeepSeek-V3:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -71,7 +71,7 @@ DeepSeek-V3:
 DeepSeek-V3-0324:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -102,7 +102,38 @@ DeepSeek-V3-0324:
 DeepSeek-R1:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true
@@ -133,7 +164,38 @@ DeepSeek-R1:
 DeepSeek-R1-0528-MXFP4-Preview:
   base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
   mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
-  dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
+  prefill:
+    mem_fraction_static: 0.8
+    disable_radix_cache: true
+    dp:
+      max_running_requests: 24
+      chunked_prefill_size: 16384
+      cuda_graph_bs: "1 2 3"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 16384
+      cuda_graph_bs_range: "1-128"
+  decode:
+    mem_fraction_static: 0.85
+    prefill_round_robin_balance: true
+    dp:
+      max_running_requests: 4096
+      chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
+      cuda_graph_bs_range: "1-160"
+    ep_only:
+      max_running_requests: 256
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-256"
+    no_dp:
+      max_running_requests: 128
+      chunked_prefill_size: 262144
+      cuda_graph_bs_range: "1-128"
+
+DeepSeek-R1-0528-MXFP4:
+  base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
+  mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
+  dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
   prefill:
     mem_fraction_static: 0.8
     disable_radix_cache: true

diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh
@@ -220,8 +220,9 @@ fi
 # =============================================================================
 IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"
 
-PREFILL_NODES_PER_WORKER=$((PREFILL_TP_SIZE / GPUS_PER_NODE))
-DECODE_NODES_PER_WORKER=$((DECODE_TP_SIZE / GPUS_PER_NODE))
+# Ceiling division by GPUS_PER_NODE for nodes-per-worker
+PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE))
+DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE))
 NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))
 
 # Build prefill arguments dynamically based on xP

diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh
@@ -70,8 +70,10 @@ PREFILL_ENABLE_EP=${9:-1}
 PREFILL_ENABLE_DP=${10:-1}
 DECODE_ENABLE_EP=${11:-1}
 DECODE_ENABLE_DP=${12:-1}
-RANDOM_RANGE_RATIO=${13}
-NODE_LIST=${14}
+PREFILL_TP=${13:-8}
+DECODE_TP=${14:-8}
+RANDOM_RANGE_RATIO=${15}
+NODE_LIST=${16}
 
 
 NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
@@ -89,10 +91,10 @@ export yD=$DECODE_WORKERS
 export NUM_NODES=$NUM_NODES
 export GPUS_PER_NODE=$GPUS_PER_NODE
 export MODEL_NAME=$MODEL_NAME
-export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $GPUS_PER_NODE / $PREFILL_WORKERS ))
+export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
 export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
 export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
-export DECODE_TP_SIZE=$(( $DECODE_NODES * $GPUS_PER_NODE / $DECODE_WORKERS ))
+export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
 export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
 export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
 export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}

diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
@@ -33,17 +33,27 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
 # Set up SGL launch script-specific environment variables
 export TIME_LIMIT="08:00:00"
 export MODEL_PATH=$MODEL_PATH
-export MODEL_NAME="DeepSeek-R1-0528-MXFP4-Preview"
+export MODEL_NAME=$MODEL_NAME
 export CONTAINER_IMAGE=$IMAGE
 
+if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
+export PREFILL_ENABLE_EP=false
+else
 export PREFILL_ENABLE_EP=true
+fi
+
 if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
 export PREFILL_ENABLE_DP=true
 else
 export PREFILL_ENABLE_DP=false
 fi
 
+if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
+export DECODE_ENABLE_EP=false
+else
 export DECODE_ENABLE_EP=true
+fi
+
 if [[ "$DECODE_DP_ATTN" == "true" ]]; then
 export DECODE_ENABLE_DP=true
 else
@@ -61,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $ISL $OSL "${CONC_LIST// /x}" inf \
     ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
     ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
     ${RANDOM_RANGE_RATIO})
 
 if [[ $? -ne 0 ]]; then

diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
@@ -33,7 +33,7 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
 # Set up SGL launch script-specific environment variables
 export TIME_LIMIT="08:00:00"
 export MODEL_PATH=$MODEL_PATH
-export MODEL_NAME="DeepSeek-R1"
+export MODEL_NAME=$MODEL_NAME
 export CONTAINER_IMAGE=$IMAGE
 
 if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
@@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
     $ISL $OSL "${CONC_LIST// /x}" inf \
     ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
     ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
+    ${PREFILL_TP} ${DECODE_TP} \
     ${RANDOM_RANGE_RATIO})
 
 if [[ $? -ne 0 ]]; then

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -835,6 +835,18 @@
     - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839
 
+- config-keys:
+    - dsr1-fp8-mi355x-sglang-disagg
+    - dsr1-fp8-mi355x-sglang-disagg-mtp
+    - dsr1-fp4-mi355x-sglang-disagg
+    - dsr1-fp4-mi355x-sglang-disagg-mtp
+  description:
+    - "Add more sweep configs for MI355X FP8/FP4 Disagg"
+    - "Add TP/DP/EP size < 8 support "
+    - "Support DSR1-0528 MTP Disagg"
+    - "Bump SGL mori image to Feb 27"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823
+
 - config-keys:
     - minimaxm2.5-fp8-h100-vllm
   description:

diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh
@@ -34,7 +34,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
     export SLURM_PARTITION="compute"
     export SLURM_JOB_NAME="benchmark-sglang-disagg.job"
 
-    export MODEL_NAME="DeepSeek-R1"
+    export MODEL_NAME=${MODEL##*/}
     export MODEL_PATH="/it-share/data"
     export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7"
     export MORI_RDMA_TC=104