Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
477 changes: 237 additions & 240 deletions .github/configs/amd-master.yaml

Large diffs are not rendered by default.

70 changes: 66 additions & 4 deletions benchmarks/multi_node/amd_utils/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@
DeepSeek-V3:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
Expand Down Expand Up @@ -71,7 +71,7 @@ DeepSeek-V3:
DeepSeek-V3-0324:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
Expand Down Expand Up @@ -102,7 +102,38 @@ DeepSeek-V3-0324:
DeepSeek-R1:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
dp:
max_running_requests: 24
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
cuda_graph_bs: "1 2 3"
no_dp:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"
decode:
mem_fraction_static: 0.85
prefill_round_robin_balance: true
dp:
max_running_requests: 4096
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
cuda_graph_bs_range: "1-160"
ep_only:
max_running_requests: 256
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-256"
no_dp:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"

DeepSeek-R1-0528:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
Expand Down Expand Up @@ -133,7 +164,38 @@ DeepSeek-R1:
DeepSeek-R1-0528-MXFP4-Preview:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
dp:
max_running_requests: 24
chunked_prefill_size: 16384
cuda_graph_bs: "1 2 3"
no_dp:
max_running_requests: 128
chunked_prefill_size: 16384
cuda_graph_bs_range: "1-128"
decode:
mem_fraction_static: 0.85
prefill_round_robin_balance: true
dp:
max_running_requests: 4096
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
cuda_graph_bs_range: "1-160"
ep_only:
max_running_requests: 256
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-256"
no_dp:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"

DeepSeek-R1-0528-MXFP4:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter"
mtp_flags: "--speculative-algorithm NEXTN --speculative-eagle-topk 1"
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head"
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
Expand Down
5 changes: 3 additions & 2 deletions benchmarks/multi_node/amd_utils/server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -220,8 +220,9 @@ fi
# =============================================================================
IFS=',' read -ra IP_ARRAY <<< "$IPADDRS"

PREFILL_NODES_PER_WORKER=$((PREFILL_TP_SIZE / GPUS_PER_NODE))
DECODE_NODES_PER_WORKER=$((DECODE_TP_SIZE / GPUS_PER_NODE))
# Ceiling division by GPUS_PER_NODE for nodes-per-worker
PREFILL_NODES_PER_WORKER=$(((PREFILL_TP_SIZE + 7) / GPUS_PER_NODE))
DECODE_NODES_PER_WORKER=$(((DECODE_TP_SIZE + 7) / GPUS_PER_NODE))
NODE_OFFSET=$((PREFILL_NODES_PER_WORKER * xP))

# Build prefill arguments dynamically based on xP
Expand Down
10 changes: 6 additions & 4 deletions benchmarks/multi_node/amd_utils/submit.sh
Original file line number Diff line number Diff line change
Expand Up @@ -70,8 +70,10 @@ PREFILL_ENABLE_EP=${9:-1}
PREFILL_ENABLE_DP=${10:-1}
DECODE_ENABLE_EP=${11:-1}
DECODE_ENABLE_DP=${12:-1}
RANDOM_RANGE_RATIO=${13}
NODE_LIST=${14}
PREFILL_TP=${13:-8}
DECODE_TP=${14:-8}
RANDOM_RANGE_RATIO=${15}
NODE_LIST=${16}


NUM_NODES=$((PREFILL_NODES + DECODE_NODES))
Expand All @@ -89,10 +91,10 @@ export yD=$DECODE_WORKERS
export NUM_NODES=$NUM_NODES
export GPUS_PER_NODE=$GPUS_PER_NODE
export MODEL_NAME=$MODEL_NAME
export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $GPUS_PER_NODE / $PREFILL_WORKERS ))
export PREFILL_TP_SIZE=$(( $PREFILL_NODES * $PREFILL_TP / $PREFILL_WORKERS ))
export PREFILL_ENABLE_EP=${PREFILL_ENABLE_EP}
export PREFILL_ENABLE_DP=${PREFILL_ENABLE_DP}
export DECODE_TP_SIZE=$(( $DECODE_NODES * $GPUS_PER_NODE / $DECODE_WORKERS ))
export DECODE_TP_SIZE=$(( $DECODE_NODES * $DECODE_TP / $DECODE_WORKERS ))
export DECODE_ENABLE_EP=${DECODE_ENABLE_EP}
export DECODE_ENABLE_DP=${DECODE_ENABLE_DP}
export DECODE_MTP_SIZE=${DECODE_MTP_SIZE}
Expand Down
13 changes: 12 additions & 1 deletion benchmarks/multi_node/dsr1_fp4_mi355x_sglang-disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,17 +33,27 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
# Set up SGL launch script-specific environment variables
export TIME_LIMIT="08:00:00"
export MODEL_PATH=$MODEL_PATH
export MODEL_NAME="DeepSeek-R1-0528-MXFP4-Preview"
export MODEL_NAME=$MODEL_NAME
export CONTAINER_IMAGE=$IMAGE

if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
export PREFILL_ENABLE_EP=false
else
export PREFILL_ENABLE_EP=true
fi

if [[ "$PREFILL_DP_ATTN" == "true" ]]; then
export PREFILL_ENABLE_DP=true
else
export PREFILL_ENABLE_DP=false
fi

if [[ "${DECODE_EP:-1}" -eq 1 ]]; then
export DECODE_ENABLE_EP=false
else
export DECODE_ENABLE_EP=true
fi

if [[ "$DECODE_DP_ATTN" == "true" ]]; then
export DECODE_ENABLE_DP=true
else
Expand All @@ -61,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
$ISL $OSL "${CONC_LIST// /x}" inf \
${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
${PREFILL_TP} ${DECODE_TP} \
${RANDOM_RANGE_RATIO})

if [[ $? -ne 0 ]]; then
Expand Down
3 changes: 2 additions & 1 deletion benchmarks/multi_node/dsr1_fp8_mi355x_sglang-disagg.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ cd "$GITHUB_WORKSPACE/benchmarks/multi_node/amd_utils" || exit 1
# Set up SGL launch script-specific environment variables
export TIME_LIMIT="08:00:00"
export MODEL_PATH=$MODEL_PATH
export MODEL_NAME="DeepSeek-R1"
export MODEL_NAME=$MODEL_NAME
export CONTAINER_IMAGE=$IMAGE

if [[ "${PREFILL_EP:-1}" -eq 1 ]]; then
Expand Down Expand Up @@ -71,6 +71,7 @@ JOB_ID=$(bash ./submit.sh $PREFILL_NODES \
$ISL $OSL "${CONC_LIST// /x}" inf \
${PREFILL_ENABLE_EP} ${PREFILL_ENABLE_DP} \
${DECODE_ENABLE_EP} ${DECODE_ENABLE_DP} \
${PREFILL_TP} ${DECODE_TP} \
${RANDOM_RANGE_RATIO})

if [[ $? -ne 0 ]]; then
Expand Down
12 changes: 12 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -835,6 +835,18 @@
- "following https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/839

- config-keys:
- dsr1-fp8-mi355x-sglang-disagg
- dsr1-fp8-mi355x-sglang-disagg-mtp
- dsr1-fp4-mi355x-sglang-disagg
- dsr1-fp4-mi355x-sglang-disagg-mtp
description:
- "Add more sweep configs for MI355X FP8/FP4 Disagg"
- "Add TP/DP/EP size < 8 support "
- "Support DSR1-0528 MTP Disagg"
- "Bump SGL mori image to Feb 27"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/823

- config-keys:
- minimaxm2.5-fp8-h100-vllm
description:
Expand Down
2 changes: 1 addition & 1 deletion runners/launch_mi355x-amds.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ if [[ "$IS_MULTINODE" == "true" ]]; then
export SLURM_PARTITION="compute"
export SLURM_JOB_NAME="benchmark-sglang-disagg.job"

export MODEL_NAME="DeepSeek-R1"
export MODEL_NAME=${MODEL##*/}
export MODEL_PATH="/it-share/data"
export IBDEVICES="rdma0,rdma1,rdma2,rdma3,rdma4,rdma5,rdma6,rdma7"
export MORI_RDMA_TC=104
Expand Down
Loading