diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index b316bcede..00fd01936 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -551,7 +551,7 @@ dsr1-fp8-mi355x-atom-mtp: - { tp: 8, conc-start: 4, conc-end: 256, spec-decoding: mtp } dsr1-fp8-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.8-rocm700-mi35x-mori-0210 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: mi355x-disagg @@ -563,9 +563,9 @@ dsr1-fp8-mi355x-sglang-disagg: - isl: 1024 osl: 1024 search-space: - # MTP configurations - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - - spec-decoding: "mtp" + # non-MTP configurations + # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) + - spec-decoding: "none" conc-list: [ 1024, 2048 ] prefill: num-worker: 1 @@ -576,16 +576,16 @@ dsr1-fp8-mi355x-sglang-disagg: - "PREFILL_NODES=1" decode: num-worker: 1 - tp: 1 - ep: 16 + tp: 8 + ep: 8 dp-attn: true additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_MTP_SIZE=0" - # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1536, 1024, 512, 256 ] + # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) + - spec-decoding: "none" + conc-list: [ 1536, 1024, 512 ] prefill: num-worker: 1 tp: 8 @@ -595,17 +595,17 @@ dsr1-fp8-mi355x-sglang-disagg: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_MTP_SIZE=0" # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + - spec-decoding: "none" + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] prefill: num-worker: 1 tp: 8 @@ -621,13 +621,13 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false 
additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=0" - - spec-decoding: "mtp" - conc-list: [ 4, 2, 1 ] + - spec-decoding: "none" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -640,51 +640,34 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=0" + - isl: 8192 + osl: 1024 + search-space: # non-MTP configurations - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) + # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP8) - spec-decoding: "none" conc-list: [ 1024, 2048 ] prefill: - num-worker: 1 + num-worker: 2 tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 1 - ep: 16 + ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024, 512, 256 ] - prefill: + - "PREFILL_NODES=2" + decode: num-worker: 1 tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 1 ep: 8 dp-attn: true additional-settings: - - "DECODE_NODES=2" + - "DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) - spec-decoding: "none" - conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] prefill: num-worker: 1 tp: 8 @@ -703,10 +686,10 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=0" - spec-decoding: "none" - conc-list: [ 4, 2, 1 ] + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -721,78 +704,53 @@ dsr1-fp8-mi355x-sglang-disagg: - 
"DECODE_NODES=1" - "DECODE_MTP_SIZE=0" - - isl: 8192 + +dsr1-fp8-mi355x-sglang-disagg-mtp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: deepseek-ai/DeepSeek-R1-0528 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp8 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 osl: 1024 search-space: # MTP configurations + # "Top of curve" (1 prefill worker at TP8 and 1 decode worker at DEP16) - spec-decoding: "mtp" conc-list: [ 1024, 2048 ] prefill: - num-worker: 2 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=2" - decode: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - - spec-decoding: "mtp" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 2 tp: 8 ep: 1 dp-attn: false additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=1" - - # "Top of curve" (1 prefill worker at DEP8 and 1 decode worker at DEP16) - - spec-decoding: "mtp" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - "PREFILL_NODES=1" decode: num-worker: 1 - tp: 1 - ep: 16 + tp: 8 + ep: 8 dp-attn: true additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=1" - # "Middle of curve" (1 prefill worker at DEP8 and 2 decode workers each at DEP8) + # "Middle of curve" (1 prefill worker at TP8 and 2 decode workers each at DEP8) - spec-decoding: "mtp" - conc-list: [ 1536, 1024 ] + conc-list: [ 1536, 1024, 512, 256 ] prefill: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true + tp: 8 + ep: 1 + dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: @@ -802,7 +760,7 @@ dsr1-fp8-mi355x-sglang-disagg: # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - spec-decoding: "mtp" - 
conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] + conc-list: [ 256, 128, 64, 32, 16, 8, 4 ] prefill: num-worker: 1 tp: 8 @@ -821,10 +779,10 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_MTP_SIZE=2" - spec-decoding: "mtp" - conc-list: [ 4, 2, 1 ] + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -839,83 +797,31 @@ dsr1-fp8-mi355x-sglang-disagg: - "DECODE_NODES=1" - "DECODE_MTP_SIZE=2" - # non-MTP configurations - - spec-decoding: "none" + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # "Top of curve" (2 prefill workers each at DEP8 and 1 decode worker at DEP8) + - spec-decoding: "mtp" conc-list: [ 1024, 2048 ] prefill: num-worker: 2 - tp: 1 + tp: 8 ep: 8 dp-attn: true additional-settings: - "PREFILL_NODES=2" decode: num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - - spec-decoding: "none" - conc-list: [ 512, 1024 ] - prefill: - num-worker: 2 tp: 8 - ep: 1 - dp-attn: false - additional-settings: - - "PREFILL_NODES=2" - decode: - num-worker: 1 - tp: 1 ep: 8 dp-attn: true additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" - - # "Top of curve" (1 prefill workers each at DEP8 and 1 decode workers at DEP16) - - spec-decoding: "none" - conc-list: [ 1024, 2048 ] - prefill: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 1 - tp: 1 - ep: 16 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" - - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1536, 1024 ] - prefill: - num-worker: 1 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "PREFILL_NODES=1" - decode: - num-worker: 2 - tp: 1 - ep: 8 - dp-attn: true - additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=1" - # "Bottom of curve" (1 prefill worker 
at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" + # "Bottom of curve" (1 prefill worker at TP8 and 2 decode workers at TP8) + - spec-decoding: "mtp" conc-list: [ 256, 128, 64, 32, 16, 8, 4, 2 ] prefill: num-worker: 1 @@ -932,13 +838,13 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=2" - - spec-decoding: "none" - conc-list: [ 4, 2, 1 ] + - spec-decoding: "mtp" + conc-list: [ 64, 32, 16, 8, 4, 2, 1 ] prefill: num-worker: 1 - tp: 8 + tp: 4 ep: 1 dp-attn: false additional-settings: @@ -951,7 +857,7 @@ dsr1-fp8-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=1" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=2" # FIXME(billishyahao): disable 1k8k for now # - isl: 1024 @@ -1078,7 +984,7 @@ dsr1-fp8-mi355x-sglang-disagg: dsr1-fp4-mi355x-sglang-disagg: - image: rocm/sgl-dev:sglang-0.5.7-rocm700-mi35x-mori-fp4-0122 + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 model: amd/DeepSeek-R1-0528-MXFP4 model-prefix: dsr1 runner: mi355x-disagg @@ -1090,48 +996,48 @@ dsr1-fp4-mi355x-sglang-disagg: - isl: 1024 osl: 1024 search-space: - # MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "mtp" - conc-list: [ 1024, 512, 256 ] + # non-MTP configurations + # 1P1D TP8 + - spec-decoding: "none" + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 - tp: 1 - ep: 8 + tp: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: - num-worker: 2 - tp: 1 - ep: 8 + num-worker: 1 + tp: 8 + ep: 1 dp-attn: false additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 32, 64, 128 ] + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: 
false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_MTP_SIZE=0" - - spec-decoding: "mtp" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 tp: 8 @@ -1139,7 +1045,6 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 @@ -1147,50 +1052,72 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=0" - # non-MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1024, 512, 256 ] + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 - tp: 1 - ep: 8 + tp: 4 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 - ep: 8 + tp: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) + - isl: 8192 + osl: 1024 + search-space: + # non-MTP configurations + # 1P1D pure TP8 - spec-decoding: "none" - conc-list: [ 32, 64, 128 ] + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=0" + # 1P2D TP8 + - spec-decoding: "none" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" decode: num-worker: 2 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" + # 1P2D TP8 - spec-decoding: "none" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + conc-list: [ 
64, 128, 256 ] prefill: num-worker: 1 tp: 8 @@ -1198,7 +1125,25 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=0" + # 1P2D TP4 + - spec-decoding: "none" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" decode: num-worker: 2 tp: 8 @@ -1208,51 +1153,61 @@ dsr1-fp4-mi355x-sglang-disagg: - "DECODE_NODES=2" - "DECODE_MTP_SIZE=0" - - isl: 8192 +dsr1-fp4-mi355x-sglang-disagg-mtp: + image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-2 + model: amd/DeepSeek-R1-0528-MXFP4 + model-prefix: dsr1 + runner: mi355x-disagg + precision: fp4 + framework: sglang-disagg + multinode: true + disagg: true + seq-len-configs: + - isl: 1024 osl: 1024 search-space: # MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) + # 1P1D TP8 - spec-decoding: "mtp" - conc-list: [ 1024, 512, 256 ] + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 - tp: 1 - ep: 8 + tp: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: - num-worker: 2 - tp: 1 - ep: 8 + num-worker: 1 + tp: 8 + ep: 1 dp-attn: false additional-settings: - - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "mtp" - conc-list: [ 32, 64, 128 ] + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=1" + - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] 
prefill: num-worker: 1 tp: 8 @@ -1260,7 +1215,6 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" - decode: num-worker: 2 tp: 8 @@ -1268,50 +1222,74 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=2" + - "DECODE_MTP_SIZE=1" - # non-MTP configurations - # "Middle of curve" (1 prefill workers each at DEP8 and 2 decode workers at DEP8) - - spec-decoding: "none" - conc-list: [ 1024, 512, 256 ] + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 - tp: 1 - ep: 8 + tp: 4 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" decode: num-worker: 2 - tp: 1 - ep: 8 + tp: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=1" - # "Bottom of curve" (1 prefill worker at TEP8 and 2 decode workers at TEP8) - - spec-decoding: "none" - conc-list: [ 32, 64, 128 ] + + - isl: 8192 + osl: 1024 + search-space: + # MTP configurations + # 1P1D pure TP8 + - spec-decoding: "mtp" + conc-list: [ 1, 2, 4, 8 ] prefill: num-worker: 1 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "PREFILL_NODES=1" + decode: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "DECODE_NODES=1" + - "DECODE_MTP_SIZE=3" + + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 2, 4, 8, 16, 32 ] + prefill: + num-worker: 1 + tp: 8 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" decode: num-worker: 2 tp: 8 - ep: 8 + ep: 1 dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=3" - - spec-decoding: "none" - conc-list: [ 128, 64, 32, 16, 8, 4 ] + # 1P2D TP8 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] prefill: num-worker: 1 tp: 8 @@ -1319,7 +1297,25 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "PREFILL_NODES=1" + decode: + num-worker: 2 + tp: 8 + ep: 1 + dp-attn: false + 
additional-settings: + - "DECODE_NODES=2" + - "DECODE_MTP_SIZE=1" + # 1P2D TP4 + - spec-decoding: "mtp" + conc-list: [ 64, 128, 256 ] + prefill: + num-worker: 1 + tp: 4 + ep: 1 + dp-attn: false + additional-settings: + - "PREFILL_NODES=1" decode: num-worker: 2 tp: 8 @@ -1327,7 +1323,8 @@ dsr1-fp4-mi355x-sglang-disagg: dp-attn: false additional-settings: - "DECODE_NODES=2" - - "DECODE_MTP_SIZE=0" + - "DECODE_MTP_SIZE=1" + # FIXME(billishyahao): disable FP4 1k8k for now # - isl: 1024 diff --git a/benchmarks/multi_node/amd_utils/models.yaml b/benchmarks/multi_node/amd_utils/models.yaml index edfebc755..5280b6767 100644 --- a/benchmarks/multi_node/amd_utils/models.yaml +++ b/benchmarks/multi_node/amd_utils/models.yaml @@ -40,7 +40,7 @@ DeepSeek-V3: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -71,7 +71,7 @@ DeepSeek-V3: DeepSeek-V3-0324: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -102,7 
+102,38 @@ DeepSeek-V3-0324: DeepSeek-R1: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE" + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + +DeepSeek-R1-0528: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true @@ -133,7 +164,38 @@ DeepSeek-R1: DeepSeek-R1-0528-MXFP4-Preview: base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake 
--load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter" mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" - dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" + prefill: + mem_fraction_static: 0.8 + disable_radix_cache: true + dp: + max_running_requests: 24 + chunked_prefill_size: 16384 + cuda_graph_bs: "1 2 3" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 16384 + cuda_graph_bs_range: "1-128" + decode: + mem_fraction_static: 0.85 + prefill_round_robin_balance: true + dp: + max_running_requests: 4096 + chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE" + cuda_graph_bs_range: "1-160" + ep_only: + max_running_requests: 256 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-256" + no_dp: + max_running_requests: 128 + chunked_prefill_size: 262144 + cuda_graph_bs_range: "1-128" + +DeepSeek-R1-0528-MXFP4: + base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter" + mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1" + dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head" prefill: mem_fraction_static: 0.8 disable_radix_cache: true diff --git a/benchmarks/multi_node/amd_utils/server.sh b/benchmarks/multi_node/amd_utils/server.sh index dadea4728..a64f85c75 100755 --- a/benchmarks/multi_node/amd_utils/server.sh +++ b/benchmarks/multi_node/amd_utils/server.sh @@ -220,8 +220,9 @@ fi # ============================================================================= IFS=',' read -ra IP_ARRAY <<< "$IPADDRS" -PREFILL_NODES_PER_WORKER=$((PREFILL_TP_SIZE / GPUS_PER_NODE)) 
-DECODE_NODES_PER_WORKER=$((DECODE_TP_SIZE / GPUS_PER_NODE)) +# Ceiling division by GPUS_PER_NODE for nodes-per-worker (a worker with TP below GPUS_PER_NODE still takes one node) +PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) +DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + GPUS_PER_NODE - 1) / GPUS_PER_NODE)) NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP)) # Build prefill arguments dynamically based on xP diff --git a/benchmarks/multi_node/amd_utils/submit.sh b/benchmarks/multi_node/amd_utils/submit.sh index a2c3622b9..44bd130f3 100755 --- a/benchmarks/multi_node/amd_utils/submit.sh +++ b/benchmarks/multi_node/amd_utils/submit.sh @@ -70,8 +70,10 @@ PREFILL_ENABLE_EP=${9:-1} PREFILL_ENABLE_DP=${10:-1} DECODE_ENABLE_EP=${11:-1} DECODE_ENABLE_DP=${12:-1} -RANDOM_RANGE_RATIO=${13} -NODE_LIST=${14} +PREFILL_TP=${13:-8} +DECODE_TP=${14:-8} +RANDOM_RANGE_RATIO=${15} +NODE_LIST=${16} NUM_NODES=$((PREFILL_NODES + DECODE_NODES)) @@ -89,10 +91,10 @@ export yD=$DECODE_WORKERS export NUM_NODES=$NUM_NODES export GPUS_PER_NODE=$GPUS_PER_NODE export MODEL_NAME=$MODEL_NAME -export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $GPUS_PER_NODE / $PREFILL_WORKERS )) +export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS )) export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP} export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP} -export DECODE_TP_SIZE=$(( $DECODE_NODES * $GPUS_PER_NODE / $DECODE_WORKERS )) +export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS )) export DECODE_ENABLE_EP=${DECODE_ENABLE_EP} export DECODE_ENABLE_DP=${DECODE_ENABLE_DP} export DECODE_MTP_SIZE=${DECODE_MTP_SIZE} diff --git a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh index aaaf97075..6a7314ab4 100644 --- a/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh @@ -33,17 +33,27 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 # Set up SGL launch script-specific environment variables export 
TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH -export MODEL_NAME="DeepSeek-R1-0528-MXFP4-Preview" +export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE +if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then +export PREFILL_ENABLE_EP=false +else export PREFILL_ENABLE_EP=true +fi + if [[ "$PREFILL_DP_ATTN" == "true" ]]; then export PREFILL_ENABLE_DP=true else export PREFILL_ENABLE_DP=false fi +if [[ "${DECODE_EP:-1}" -eq 1 ]]; then +export DECODE_ENABLE_EP=false +else export DECODE_ENABLE_EP=true +fi + if [[ "$DECODE_DP_ATTN" == "true" ]]; then export DECODE_ENABLE_DP=true else @@ -61,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $ISL $OSL "${CONC_LIST// /x}" inf \ ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + "${PREFILL_TP:-8}" "${DECODE_TP:-8}" \ ${RANDOM_RANGE_RATIO}) if [[ $? -ne 0 ]]; then diff --git a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh index 0a5a5c30b..0124d4b4d 100644 --- a/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh +++ b/benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh @@ -33,7 +33,7 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1 # Set up SGL launch script-specific environment variables export TIME_LIMIT="08:00:00" export MODEL_PATH=$MODEL_PATH -export MODEL_NAME="DeepSeek-R1" +export MODEL_NAME=$MODEL_NAME export CONTAINER_IMAGE=$IMAGE if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then @@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \ $ISL $OSL "${CONC_LIST// /x}" inf \ ${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \ ${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \ + "${PREFILL_TP:-8}" "${DECODE_TP:-8}" \ ${RANDOM_RANGE_RATIO}) if [[ $? 
-ne 0 ]]; then diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a48f42b6b..05b5dd5ce 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -835,6 +835,18 @@ - "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839 +- config-keys: + - dsr1-fp8-mi355x-sglang-disagg + - dsr1-fp8-mi355x-sglang-disagg-mtp + - dsr1-fp4-mi355x-sglang-disagg + - dsr1-fp4-mi355x-sglang-disagg-mtp + description: + - "Add more sweep configs for MI355X FP8/FP4 Disagg" + - "Add TP/DP/EP size < 8 support" + - "Support DSR1-0528 MTP Disagg" + - "Bump SGL mori image to Feb 27" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823 + - config-keys: - minimaxm2.5-fp8-h100-vllm description: diff --git a/runners/launch_mi355x-amds.sh b/runners/launch_mi355x-amds.sh index 2b9902b0b..f4f1e561f 100644 --- a/runners/launch_mi355x-amds.sh +++ b/runners/launch_mi355x-amds.sh @@ -34,7 +34,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then export SLURM_PARTITION="compute" export SLURM_JOB_NAME="benchmark-sglang-disagg.job" - export MODEL_NAME="DeepSeek-R1" + export MODEL_NAME=${MODEL##*/} export MODEL_PATH="/it-share/data" export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7" export MORI_RDMA_TC=104