From b7b85df738e94e84ce3264d4b147b9cceaddfc3e Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Tue, 24 Feb 2026 08:46:51 +0000 Subject: [PATCH 01/17] Update Image for DSR1 FP4 MI355X SGLang Disagg New image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2 Goal: solve the DP Attention FP4 Disagg AMD broken issue --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d7e51cb28..d5536bd1f 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -918,7 +918,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.7-rocm700-mi35x-mori-fp4-0122 + image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg From df3f4e0cad549e9291e1b3a82739917db9b3f236 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Tue, 24 Feb 2026 11:55:37 +0000 Subject: [PATCH 02/17] Turn on DP Atttention --- .github/configs/amd-master.yaml | 48 ++++++++++++++++----------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d5536bd1f..ddcb7c0df 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -938,14 +938,14 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" @@ -957,7 +957,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -965,7 +965,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" @@ -976,7 +976,7 @@ 
dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -984,7 +984,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=2" @@ -997,14 +997,14 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" @@ -1016,7 +1016,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -1024,7 +1024,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" @@ -1035,7 +1035,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -1043,7 +1043,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" @@ -1059,14 +1059,14 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" @@ -1078,7 +1078,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -1086,7 +1086,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" @@ -1097,7 +1097,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - 
"PREFILL_NODES=1" @@ -1105,7 +1105,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=2" @@ -1118,14 +1118,14 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 tp: 1 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" @@ -1137,7 +1137,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -1145,7 +1145,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" @@ -1156,7 +1156,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" @@ -1164,7 +1164,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 2 tp: 8 ep: 1 - dp-attn: false + dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" From 7421f6f71bd241743f930e375d9f8cea962274f3 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Tue, 24 Feb 2026 18:24:46 +0000 Subject: [PATCH 03/17] Turn off dp-attn for prefill --- .github/configs/amd-master.yaml | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index ddcb7c0df..d4dbfe80d 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -938,7 +938,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: @@ -957,7 +957,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -976,7 +976,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 
- dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -997,7 +997,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: @@ -1016,7 +1016,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -1035,7 +1035,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -1059,7 +1059,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: @@ -1078,7 +1078,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -1097,7 +1097,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -1118,7 +1118,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 1 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: @@ -1137,7 +1137,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" @@ -1156,7 +1156,7 @@ dsr1-fp4-mi355x-sglang-disagg: num-worker: 1 tp: 8 ep: 1 - dp-attn: true + dp-attn: false additional-settings: - "PREFILL_NODES=1" From 5d2f28bd6591a8fc7134581a340928c28970c5bc Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 27 Feb 2026 09:53:11 +0000 Subject: [PATCH 04/17] add fp4 configs --- .github/configs/amd-master.yaml | 227 +++++++++--------- .../dsr1_fp4_mi355x_sglang-disagg.sh | 12 +- .../dsr1_fp8_mi355x_sglang-disagg.sh | 2 +- runners/launch_mi355x-amds.sh | 2 +- 4 files changed, 133 insertions(+), 110 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index d4dbfe80d..c482c0124 100644 --- 
a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -918,7 +918,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2 + image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -930,48 +930,29 @@ dsr1-fp4-mi355x-sglang-disagg: - isl: 1024 osl: 1024 search-space: - # MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1024, 512, 256 ] + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 - tp: 1 - ep: 8 + tp: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: - num-worker: 2 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 32, 64, 128 ] - prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - - "PREFILL_NODES=1" - - decode: - num-worker: 2 - tp: 8 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" - - spec-decoding: "mtp" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 8 @@ -979,58 +960,60 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 ep: 1 - dp-attn: true + dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=0" - # non-MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) + # 1P2D DEP8+TEP8 - spec-decoding: "none" - conc-list: [ 1024, 512, 256 ] + conc-list: [ 
256, 512 ] prefill: num-worker: 1 - tp: 1 + tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 + tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 - spec-decoding: "none" - conc-list: [ 32, 64, 128 ] + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: - num-worker: 2 + num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" + # 1P2D TP8 - spec-decoding: "none" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 8 @@ -1038,61 +1021,70 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 ep: 1 - dp-attn: true + dp-attn: false additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - - isl: 8192 - osl: 1024 - search-space: - # MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1024, 512, 256 ] + # 2P1D DEP8+TEP8 + - spec-decoding: "none" + conc-list: [ 256, 512 ] prefill: - num-worker: 1 - tp: 1 + num-worker: 2 + tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - - "PREFILL_NODES=1" + - "PREFILL_NODES=2" decode: - num-worker: 2 - tp: 1 + num-worker: 1 + tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) +dsr1-fp4-mi355x-sglang-disagg-mtp: + image: 
rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + # MTP configurations + # 1P1D TP8 - spec-decoding: "mtp" - conc-list: [ 32, 64, 128 ] + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: - num-worker: 2 + num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 8 @@ -1100,58 +1092,60 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 ep: 1 - dp-attn: true + dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=3" - # non-MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1024, 512, 256 ] + # 1P2D DEP8+TEP8 + - spec-decoding: "mtp" + conc-list: [ 256, 512 ] prefill: num-worker: 1 - tp: 1 + tp: 8 ep: 8 - dp-attn: false + dp-attn: true additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 + tp: 8 ep: 8 - dp-attn: true + dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=1" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" - conc-list: [ 32, 64, 128 ] + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: 
false additional-settings: - "PREFILL_NODES=1" - decode: - num-worker: 2 + num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" - - spec-decoding: "none" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] prefill: num-worker: 1 tp: 8 @@ -1159,15 +1153,34 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 ep: 1 - dp-attn: true + dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=3" + + # 2P1D DEP8+TEP8 + - spec-decoding: "mtp" + conc-list: [ 256, 512 ] + prefill: + num-worker: 2 + tp: 8 + ep: 8 + dp-attn: true + additional-settings: + - "PREFILL_NODES=2" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=1" + # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index aaaf97075..f568b2b5d 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -33,17 +33,27 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 # Set up SGL launch script-specific environment variables export TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH -export MODEL_NAME="DeepSeek-R1-0528-MXFP4-Preview" +export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else export PREFILL_ENABLE_EP=true +fi + if [[ "$PREFILL_DP_ATTN" == "true" ]]; then export PREFILL_ENABLE_DP=true else export PREFILL_ENABLE_DP=false fi +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else export DECODE_ENABLE_EP=true 
+fi + if [[ "$DECODE_DP_ATTN" == "true" ]]; then export DECODE_ENABLE_DP=true else diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0a5a5c30b..9a0ea92e7 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -33,7 +33,7 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 # Set up SGL launch script-specific environment variables export TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH -export MODEL_NAME="DeepSeek-R1" +export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 2b9902b0b..f4f1e561f 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -34,7 +34,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export SLURM_PARTITION="compute" export SLURM_JOB_NAME="benchmark-sglang-disagg.job" - export MODEL_NAME="DeepSeek-R1" + export MODEL_NAME=${MODEL##*/} export MODEL_PATH="/it-share/data" export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" export MORI_RDMA_TC=104 From b434f0a138042378b4fc4a70ab1ea4da4c0ec992 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Fri, 27 Feb 2026 10:19:35 +0000 Subject: [PATCH 05/17] add models yaml --- benchmarks/multi_node/amd_utils/models.yaml | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index edfebc755..3a66eb46b 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -160,3 +160,34 @@ DeepSeek-R1-0528-MXFP4-Preview: max_running_requests: 128 chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" + +DeepSeek-R1-0528-MXFP4: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 
--ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: 16384 + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 16384 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" From 55ef22f6fdae13186a13c272ebf0541bd375c878 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 28 Feb 2026 06:15:23 +0000 Subject: [PATCH 06/17] add more fp8 configs --- .github/configs/amd-master.yaml | 228 ++++++-------------- benchmarks/multi_node/amd_utils/models.yaml | 31 +++ 2 files changed, 98 insertions(+), 161 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c482c0124..1d9dcea89 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -391,7 +391,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.8-rocm700-mi35x-mori-0210 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -403,9 +403,9 @@ dsr1-fp8-mi355x-sglang-disagg: - isl: 1024 osl: 1024 search-space: - # MTP configurations - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at 
DEP16) - - spec-decoding: "mtp" + # non-MTP configurations + # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) + - spec-decoding: "none" conc-list: [ 1024, 2048 ] prefill: num-worker: 1 @@ -416,15 +416,15 @@ dsr1-fp8-mi355x-sglang-disagg: - "PREFILL_NODES=1" decode: num-worker: 1 - tp: 1 + tp: 16 ep: 16 dp-attn: true additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_MTP_SIZE=0" - # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - - spec-decoding: "mtp" + # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) + - spec-decoding: "none" conc-list: [ 1536, 1024, 512, 256 ] prefill: num-worker: 1 @@ -435,16 +435,16 @@ dsr1-fp8-mi355x-sglang-disagg: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" + - spec-decoding: "none" conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] prefill: num-worker: 1 @@ -461,13 +461,13 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=0" - - spec-decoding: "mtp" - conc-list: [ 4, 2, 1 ] + - spec-decoding: "none" + conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -480,49 +480,32 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=0" + - isl: 8192 + osl: 1024 + search-space: # non-MTP configurations - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) - spec-decoding: "none" conc-list: [ 1024, 2048 ] prefill: - num-worker: 1 + num-worker: 2 tp: 8 - ep: 1 - dp-attn: false - 
additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 1 - ep: 16 + ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024, 512, 256 ] - prefill: + - "PREFILL_NODES=2" + decode: num-worker: 1 tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 1 ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - spec-decoding: "none" conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] prefill: @@ -543,10 +526,10 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" - spec-decoding: "none" - conc-list: [ 4, 2, 1 ] + conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -561,78 +544,53 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - - isl: 8192 + +dsr1-fp8-mi355x-sglang-disagg-mtp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 osl: 1024 search-space: # MTP configurations + # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - spec-decoding: "mtp" conc-list: [ 1024, 2048 ] prefill: - num-worker: 2 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - spec-decoding: "mtp" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 2 tp: 8 ep: 1 dp-attn: false 
additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - "PREFILL_NODES=1" decode: num-worker: 1 - tp: 1 + tp: 16 ep: 16 dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" - # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) + # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - spec-decoding: "mtp" - conc-list: [ 1536, 1024 ] + conc-list: [ 1536, 1024, 512, 256 ] prefill: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true + tp: 8 + ep: 1 + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: @@ -661,10 +619,10 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=2" - spec-decoding: "mtp" - conc-list: [ 4, 2, 1 ] + conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -679,83 +637,31 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # non-MTP configurations - - spec-decoding: "none" + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (2 prefill worker at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "mtp" conc-list: [ 1024, 2048 ] prefill: num-worker: 2 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - "PREFILL_NODES=2" decode: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 2 tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 1 ep: 8 dp-attn: 
true additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024 ] - prefill: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=1" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "mtp" conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] prefill: num-worker: 1 @@ -772,13 +678,13 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=2" - - spec-decoding: "none" - conc-list: [ 4, 2, 1 ] + - spec-decoding: "mtp" + conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -791,7 +697,7 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=2" # FIXME(billishyahao): disable 1k8k for now # - isl: 1024 @@ -918,7 +824,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1050,7 +956,7 @@ 
dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: - image: rocm/sgl-dev:sglang-0.5.8-rocm720-mi35x-mori-0218-2-fixmtp + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 3a66eb46b..5cfad98db 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -130,6 +130,37 @@ DeepSeek-R1: chunked_prefill_size: 262144 cuda_graph_bs_range: "1-128" +DeepSeek-R1-0528: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + DeepSeek-R1-0528-MXFP4-Preview: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend 
aiter" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" From 618dab4d74c4e0bb6491f1150a8501a5053c6fcb Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 28 Feb 2026 06:18:19 +0000 Subject: [PATCH 07/17] specify moriep normal mode --- benchmarks/multi_node/amd_utils/models.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index 5cfad98db..5280b6767 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -40,7 +40,7 @@ DeepSeek-V3: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -71,7 +71,7 @@ DeepSeek-V3: DeepSeek-V3-0324: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -102,7 +102,7 @@ DeepSeek-V3-0324: DeepSeek-R1: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 
3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -133,7 +133,7 @@ DeepSeek-R1: DeepSeek-R1-0528: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -164,7 +164,7 @@ DeepSeek-R1-0528: DeepSeek-R1-0528-MXFP4-Preview: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -195,7 +195,7 @@ DeepSeek-R1-0528-MXFP4-Preview: DeepSeek-R1-0528-MXFP4: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method 
round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true From 08b90f01d8627e2ea11da9f1d1811eac9bcbf6d3 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 28 Feb 2026 09:56:06 +0000 Subject: [PATCH 08/17] add fp4 configs --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 1d9dcea89..91ae17c17 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -1009,7 +1009,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # 1P2D DEP8+TEP8 - spec-decoding: "mtp" - conc-list: [ 256, 512 ] + conc-list: [ 256, 512, 1024 ] prefill: num-worker: 1 tp: 8 @@ -1070,7 +1070,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # 2P1D DEP8+TEP8 - spec-decoding: "mtp" - conc-list: [ 256, 512 ] + conc-list: [ 256, 512, 1024 ] prefill: num-worker: 2 tp: 8 From 490645237e7acfef2782c398847862926fd966ee Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 28 Feb 2026 11:06:31 +0000 Subject: [PATCH 09/17] fix fp4 configs --- .github/configs/amd-master.yaml | 84 ++++++++++++++++++++++++--------- 1 file changed, 62 insertions(+), 22 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 91ae17c17..ddaec6e57 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -445,7 +445,7 @@ dsr1-fp8-mi355x-sglang-disagg: # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] prefill: num-worker: 1 tp: 8 @@ -464,7 
+464,7 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" - spec-decoding: "none" - conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] + conc-list: [ 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -507,7 +507,7 @@ dsr1-fp8-mi355x-sglang-disagg: # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] prefill: num-worker: 1 tp: 8 @@ -526,7 +526,7 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" - spec-decoding: "none" - conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] + conc-list: [ 128, 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -600,7 +600,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] prefill: num-worker: 1 tp: 8 @@ -619,7 +619,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" - spec-decoding: "mtp" - conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] + conc-list: [ 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -681,7 +681,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" - spec-decoding: "mtp" - conc-list: [ 128, 64, 32, 16, 8, 4, 2, 1 ] + conc-list: [ 4, 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -990,7 +990,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: # 1P2D TP8 - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] + conc-list: [ 2, 4, 8, 16, 32 ] prefill: num-worker: 1 tp: 8 @@ -1007,25 +1007,45 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=3" - # 1P2D DEP8+TEP8 + # 1P2D TP8 - spec-decoding: "mtp" - conc-list: [ 256, 512, 1024 ] + conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 tp: 8 - ep: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + 
- spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" + - isl: 8192 osl: 1024 search-space: @@ -1049,9 +1069,10 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=3" + # 1P2D TP8 - spec-decoding: "mtp" - conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] + conc-list: [ 2, 4, 8, 16, 32 ] prefill: num-worker: 1 tp: 8 @@ -1068,23 +1089,42 @@ dsr1-fp4-mi355x-sglang-disagg-mtp: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=3" - # 2P1D DEP8+TEP8 + # 1P2D TP8 - spec-decoding: "mtp" - conc-list: [ 256, 512, 1024 ] + conc-list: [ 64, 128, 256 ] prefill: - num-worker: 2 + num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - - "PREFILL_NODES=2" + - "PREFILL_NODES=1" decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - - "DECODE_NODES=1" + - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" From 70a707a90057669bba23a9978ef460af2ec76524 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sat, 28 Feb 2026 15:02:18 +0000 Subject: [PATCH 10/17] add change log --- perf-changelog.yaml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 9c4c9e438..74790a5f5 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -733,4 +733,16 @@ - "Extend concurrency range to conc-end: 256 across all sequence lengths (1k1k, 1k8k, 8k1k)" - "Fix MTP 1k8k conc-start from 256 to 4 to enable full concurrency sweep" pr-link: 
https://github.com/SemiAnalysisAI/InferenceX/pull/699 + +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add more sweep configs for MI355X FP8/FP4 Disagg" + - "Add TP/DP/EP size < 8 support " + - "Support DSR1-0528 MTP Disagg" + - "Bump SGL mori image to Feb 27" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 From 52f97ca1f12742d577af9c4e2cc44326ec6a3769 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sun, 1 Mar 2026 13:20:39 +0000 Subject: [PATCH 11/17] add tp4 support --- benchmarks/multi_node/amd_utils/server.sh | 5 +++-- benchmarks/multi_node/amd_utils/submit.sh | 10 ++++++---- benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh | 1 + benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh | 1 + 4 files changed, 11 insertions(+), 6 deletions(-) diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index dadea4728..a64f85c75 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -220,8 +220,9 @@ fi # ============================================================================= IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" -PREFILL_NODES_PER_WORKER=$((PREFILL_TP_SIZE / GPUS_PER_NODE)) -DECODE_NODES_PER_WORKER=$((DECODE_TP_SIZE / GPUS_PER_NODE)) +# Ceiling division by GPUS_PER_NODE for nodes-per-worker (a TP size below GPUS_PER_NODE rounds up to one node instead of flooring to zero) +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) # Build prefill arguments dynamically based on xP diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index a2c3622b9..44bd130f3 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -70,8 +70,10 @@ PREFILL_ENABLE_EP=${9:-1} PREFILL_ENABLE_DP=${10:-1}
DECODE_ENABLE_EP=${11:-1} DECODE_ENABLE_DP=${12:-1} -RANDOM_RANGE_RATIO=${13} -NODE_LIST=${14} +PREFILL_TP=${13:-8} +DECODE_TP=${14:-8} +RANDOM_RANGE_RATIO=${15} +NODE_LIST=${16} NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -89,10 +91,10 @@ export yD=$DECODE_WORKERS export NUM_NODES=$NUM_NODES export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME -export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $GPUS_PER_NODE / $PREFILL_WORKERS )) +export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS )) export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} -export DECODE_TP_SIZE=$(( $DECODE_NODES * $GPUS_PER_NODE / $DECODE_WORKERS )) +export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS )) export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} export DECODE_MTP_SIZE=${DECODE_MTP_SIZE} diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index f568b2b5d..6a7314ab4 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $ISL $OSL "${CONC_LIST// /x}" inf \ ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ ${RANDOM_RANGE_RATIO}) if [[ $? -ne 0 ]]; then diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 9a0ea92e7..0124d4b4d 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $ISL $OSL "${CONC_LIST// /x}" inf \ ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + ${PREFILL_TP} ${DECODE_TP} \ ${RANDOM_RANGE_RATIO}) if [[ $? 
-ne 0 ]]; then From 7f664ce905f7d09c9ec850c3a083311389bd687e Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sun, 1 Mar 2026 14:31:52 +0000 Subject: [PATCH 12/17] fix fp4 configs --- .github/configs/amd-master.yaml | 82 ++++++++++++++++++++++++--------- 1 file changed, 60 insertions(+), 22 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index fb562fbc7..5f40e62a9 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -464,7 +464,7 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" - spec-decoding: "none" - conc-list: [ 2, 1 ] + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -526,7 +526,7 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" - spec-decoding: "none" - conc-list: [ 128, 2, 1 ] + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -619,7 +619,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" - spec-decoding: "mtp" - conc-list: [ 2, 1 ] + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -681,7 +681,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "DECODE_MTP_SIZE=2" - spec-decoding: "mtp" - conc-list: [ 4, 2, 1 ] + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 tp: 4 @@ -858,7 +858,7 @@ dsr1-fp4-mi355x-sglang-disagg: # 1P2D TP8 - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] + conc-list: [ 2, 4, 8, 16, 32 ] prefill: num-worker: 1 tp: 8 @@ -875,20 +875,39 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - # 1P2D DEP8+TEP8 - - spec-decoding: "none" - conc-list: [ 256, 512 ] + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 tp: 8 - ep: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 
128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" @@ -918,8 +937,8 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" # 1P2D TP8 - - spec-decoding: "none" - conc-list: [ 2, 4, 8, 16, 32, 64, 128, 256 ] + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] prefill: num-worker: 1 tp: 8 @@ -936,23 +955,42 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - # 2P1D DEP8+TEP8 - - spec-decoding: "none" - conc-list: [ 256, 512 ] + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] prefill: - num-worker: 2 + num-worker: 1 tp: 8 - ep: 8 - dp-attn: true + ep: 1 + dp-attn: false additional-settings: - - "PREFILL_NODES=2" + - "PREFILL_NODES=1" decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" + decode: + num-worker: 2 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - - "DECODE_NODES=1" + - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" dsr1-fp4-mi355x-sglang-disagg-mtp: From 02a9806f030d12db61e1c0319364914ffa95c3d0 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Sun, 1 Mar 2026 15:38:10 +0000 Subject: [PATCH 13/17] fix --- .github/configs/amd-master.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5f40e62a9..3c277cfec 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -876,7 +876,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" # 1P2D TP8 - - spec-decoding: "mtp" + - spec-decoding: "none" conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 @@ -895,7 +895,7 @@ 
dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" # 1P2D TP4 - - spec-decoding: "mtp" + - spec-decoding: "none" conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 @@ -937,7 +937,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" # 1P2D TP8 - - spec-decoding: "mtp" + - spec-decoding: "none" conc-list: [ 2, 4, 8, 16, 32 ] prefill: num-worker: 1 @@ -956,7 +956,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" # 1P2D TP8 - - spec-decoding: "mtp" + - spec-decoding: "none" conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 @@ -975,7 +975,7 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" # 1P2D TP4 - - spec-decoding: "mtp" + - spec-decoding: "none" conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 From e95f94ba7ac9f483b2a7c600fe6138c650cca9d6 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 2 Mar 2026 14:00:18 +0000 Subject: [PATCH 14/17] fix ep16 config --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c2b5f4275..f99bfcf10 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -492,7 +492,7 @@ dsr1-fp8-mi355x-sglang-disagg: ep: 16 dp-attn: true additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) @@ -647,7 +647,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: ep: 16 dp-attn: true additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" - "DECODE_MTP_SIZE=1" # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) From 0d7699d09cbb883479f1945315caff45b461e090 Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 2 Mar 2026 14:25:02 +0000 Subject: [PATCH 15/17] fix 1k1k config --- .github/configs/amd-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 
f99bfcf10..8406045ad 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -652,7 +652,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - spec-decoding: "mtp" - conc-list: [ 1536, 1024, 512, 256 ] + conc-list: [ 1536, 1024, 512 ] prefill: num-worker: 1 tp: 8 From 57fe4408f3676a1b697a1334161eec4501aa63fc Mon Sep 17 00:00:00 2001 From: billishyahao Date: Mon, 2 Mar 2026 16:22:14 +0000 Subject: [PATCH 16/17] fix ep16 config --- .github/configs/amd-master.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8406045ad..2ebe168b5 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -488,11 +488,11 @@ dsr1-fp8-mi355x-sglang-disagg: - "PREFILL_NODES=1" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=1" + - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" # "Middle of curve" (1 prefill workers each at TP8 and 2 decode workers at DEP8) @@ -643,11 +643,11 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: - "PREFILL_NODES=1" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 8 + ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=1" + - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) From 971e78d2e5647f5cd4cc245c53a5edf84db1548a Mon Sep 17 00:00:00 2001 From: billishyahao Date: Tue, 3 Mar 2026 02:49:27 +0000 Subject: [PATCH 17/17] fix fp8 config --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2ebe168b5..537bc48a2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -497,7 +497,7 @@ dsr1-fp8-mi355x-sglang-disagg: # "Middle of curve" (1 prefill workers each at TP8 and 2 
decode workers at DEP8) - spec-decoding: "none" - conc-list: [ 1536, 1024, 512, 256 ] + conc-list: [ 1536, 1024, 512 ] prefill: num-worker: 1 tp: 8 @@ -652,7 +652,7 @@ dsr1-fp8-mi355x-sglang-disagg-mtp: # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - spec-decoding: "mtp" - conc-list: [ 1536, 1024, 512 ] + conc-list: [ 1536, 1024, 512, 256 ] prefill: num-worker: 1 tp: 8