Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
57 commits
Select commit Hold shift + click to select a range
0383696
[AMD] add dsr1 mxfp4 v2 sweep points
billishyahao Mar 16, 2026
18e05b1
fix
billishyahao Mar 17, 2026
32b5d3d
Fix tokenizer mismatch between benchmark client and sglang server on …
ZhaiFeiyue Mar 24, 2026
0bd347f
change mtp model to fp8
billishyahao Mar 25, 2026
754e53c
change fp8 image
billishyahao Mar 25, 2026
f29f2d0
bump image to 0327
billishyahao Mar 27, 2026
a44c7eb
remove specv2
billishyahao Mar 27, 2026
2514136
consolidate dsr1 fp4 configs
billishyahao Mar 30, 2026
3b4d4ab
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-march15
billishyahao Mar 30, 2026
682a4ab
bump fp8 image to 0327
billishyahao Mar 30, 2026
64bf100
fix crash
billishyahao Mar 30, 2026
c44e175
fix env
billishyahao Mar 30, 2026
0a41f89
cleanup
billishyahao Mar 31, 2026
7282748
add perf change log
billishyahao Mar 31, 2026
e6d4b32
add deprecate comments
billishyahao Mar 31, 2026
b7dd65f
add spec v2 env
billishyahao Apr 1, 2026
12a4ba0
bump the docker image
billishyahao Apr 2, 2026
597a458
add stream control to eliminate cpu overhead
billishyahao Apr 9, 2026
f715e47
tune the config
billishyahao Apr 10, 2026
2ea82d5
bump image
billishyahao Apr 11, 2026
16384e7
tune config
billishyahao Apr 11, 2026
4d733e7
add new exp config
billishyahao Apr 13, 2026
83af743
enable log level info
billishyahao Apr 13, 2026
0c3083e
fix mori env
billishyahao Apr 13, 2026
1c61622
bump image
billishyahao Apr 13, 2026
e2d2ac9
fix log
billishyahao Apr 13, 2026
d2a7988
bump the image
billishyahao Apr 14, 2026
b09ae6c
fix
billishyahao Apr 14, 2026
2c3ee04
fix
billishyahao Apr 14, 2026
69102f7
fix
billishyahao Apr 15, 2026
668068c
fix
billishyahao Apr 16, 2026
776fd42
bump image to 0416
billishyahao Apr 16, 2026
2471379
fix
billishyahao Apr 17, 2026
c80997f
set si to 100
billishyahao Apr 17, 2026
616c57d
bump the image
billishyahao Apr 18, 2026
3d62e2c
revert old image
billishyahao Apr 19, 2026
2c4c09d
revert old image
billishyahao Apr 19, 2026
1c9b8d2
increase DISPATCH_TOKENS_PREFILL to 5120
billishyahao Apr 20, 2026
8e6104e
bump image to 0417
billishyahao Apr 20, 2026
7cc5d81
add exp config
billishyahao Apr 21, 2026
a1c05da
add exp config
billishyahao Apr 22, 2026
a915729
add exp config
billishyahao Apr 23, 2026
44d10a1
add exp config
billishyahao Apr 23, 2026
f09820e
add exp configs
billishyahao Apr 24, 2026
5144ca1
add exp configs
billishyahao Apr 24, 2026
d9e2eef
bump image
billishyahao Apr 28, 2026
ee33925
sync arguments
billishyahao Apr 30, 2026
2b1ff6b
fix
billishyahao Apr 30, 2026
0548773
fix config
billishyahao May 1, 2026
724bd61
add exp configs
billishyahao May 1, 2026
f8f0a3a
enable sdma
billishyahao May 1, 2026
feb6c7d
fix
billishyahao May 1, 2026
f501a3e
fix
billishyahao May 1, 2026
217d892
cleanup
billishyahao May 1, 2026
a5a822a
bump image
billishyahao May 2, 2026
91e1396
Merge remote-tracking branch 'inf/main' into amd/mi355x-dsfp4-april14
billishyahao May 2, 2026
4d3eaf2
fix yaml
billishyahao May 2, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 18 additions & 17 deletions .github/configs/amd-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1147,10 +1147,9 @@ dsr1-fp8-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=2"


dsr1-fp4-mi355x-sglang-disagg:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
precision: fp4
Expand Down Expand Up @@ -1241,7 +1240,7 @@ dsr1-fp4-mi355x-sglang-disagg:

# 1*DEP4 + 1*DEP8
- spec-decoding: "none"
conc-list: [ 1024, 2048 ]
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 1
tp: 4
Expand Down Expand Up @@ -1338,16 +1337,16 @@ dsr1-fp4-mi355x-sglang-disagg:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=0"

# 4*DEP4 + 1*DEP8
# 2*DEP8 + 1*DEP8
- spec-decoding: "none"
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 4
tp: 4
ep: 4
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
Expand All @@ -1357,9 +1356,10 @@ dsr1-fp4-mi355x-sglang-disagg:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=0"


dsr1-fp4-mi355x-sglang-disagg-mtp:
image: rocm/sgl-dev:sglang-0.5.9-rocm720-mi35x-mori-0227-3
model: amd/DeepSeek-R1-0528-MXFP4
image: lmsysorg/sglang-rocm:v0.5.10.post1-rocm720-mi35x-20260501
model: amd/DeepSeek-R1-0528-MXFP4-v2
model-prefix: dsr1
runner: mi355x-disagg
precision: fp4
Expand Down Expand Up @@ -1450,7 +1450,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:

# 1*DEP4 + 1*DEP8
- spec-decoding: "mtp"
conc-list: [ 1024, 2048 ]
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 1
tp: 4
Expand Down Expand Up @@ -1549,16 +1549,16 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
- "DECODE_NODES=2"
- "DECODE_MTP_SIZE=1"

# 4*DEP4 + 1*DEP8
# 2*DEP8 + 1*DEP8
- spec-decoding: "mtp"
conc-list: [ 1024, 2048, 4096 ]
prefill:
num-worker: 4
tp: 4
ep: 4
num-worker: 2
tp: 8
ep: 8
dp-attn: true
additional-settings:
- "PREFILL_NODES=4"
- "PREFILL_NODES=2"
decode:
num-worker: 1
tp: 8
Expand All @@ -1567,6 +1567,7 @@ dsr1-fp4-mi355x-sglang-disagg-mtp:
additional-settings:
- "DECODE_NODES=1"
- "DECODE_MTP_SIZE=1"


dsv4-fp8-mi355x-sglang:
image: rocm/sgl-dev:deepseek-v4-mi35x
Expand Down
53 changes: 34 additions & 19 deletions benchmarks/multi_node/amd_utils/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -34,39 +34,47 @@ export IBDEVICES
export GLOO_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)
export NCCL_SOCKET_IFNAME=$(ip route | grep '^default' | awk '{print $5}' | head -n 1)

set +x

export NCCL_IB_HCA=$IBDEVICES

export SGLANG_USE_AITER=1
export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=1200
export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=1200

export SGLANG_MORI_DISPATCH_DTYPE=auto
export SGLANG_MORI_FP8_COMB=true
export SGLANG_MORI_QP_PER_TRANSFER=4
export SGLANG_MORI_NUM_WORKERS=4
export MORI_IO_SQ_BACKOFF_TIMEOUT_US=50000

export MORI_IO_QP_MAX_SEND_WR=16384
export MORI_IO_QP_MAX_CQE=32768
export MORI_IO_QP_MAX_SGE=4

export MORI_IO_TC_DISABLE=0

export SGLANG_DISAGGREGATION_BOOTSTRAP_TIMEOUT=3600
export SGLANG_DISAGGREGATION_WAITING_TIMEOUT=3600

# Disable allocating memory in one pass
export MORI_SHMEM_MODE=ISOLATION
export SGLANG_MORI_FP8_DISP=True

if [[ "$MODEL_NAME" == *mxfp4* ]]; then
export SGLANG_MORI_FP8_DISP=False
fi
# Enable spec v2
export SGLANG_ENABLE_SPEC_V2=1
export SGLANG_ENABLE_OVERLAP_PLAN_STREAM=1

export SGLANG_MORI_FP4_DISP=False
export SGLANG_MORI_FP8_COMB=False
export SGLANG_LOG_MS=true
export SGLANG_DISAGGREGATION_NUM_PRE_ALLOCATE_REQS=32

# Per-role dispatch token limits (prefill gets the higher limit, decode the lower one)
export MORI_MAX_DISPATCH_TOKENS_PREFILL=16384
if [[ "$MODEL_NAME" == *mxfp4* ]]; then
export MORI_MAX_DISPATCH_TOKENS_PREFILL=12288
fi
export MORI_MAX_DISPATCH_TOKENS_DECODE=160
export MORI_MAX_DISPATCH_TOKENS_PREFILL=8192
export MORI_MAX_DISPATCH_TOKENS_DECODE=512

export MORI_MOE_MAX_INPUT_TOKENS_PREFILL=32768
export MORI_MOE_MAX_INPUT_TOKENS_DECODE=2703

# set MTP size=1 when EP16
export SGLANG_MORI_DISPATCH_INTER_KERNEL_SWITCH_THRESHOLD=$((MORI_MAX_DISPATCH_TOKENS_DECODE * 2))

export MORI_EP_LAUNCH_CONFIG_MODE=AUTO
export MORI_IO_QP_MAX_SEND_WR=16384
export MORI_IO_QP_MAX_CQE=32768
export MORI_IO_QP_MAX_SGE=4


export MORI_APP_LOG_LEVEL=INFO

Expand All @@ -89,17 +97,21 @@ $1 == "DSCP" && $2 == ":" && $NF == p {
if [[ -n "$ND_DSCP" ]] && [[ -n "$ND_PRIO" ]]; then
TC=$(( 4 * ND_DSCP ))
export MORI_RDMA_SL=$ND_PRIO
export MORI_IO_SL=$ND_PRIO
export MORI_RDMA_TC=$TC
echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL"
export MORI_IO_TC=$TC
echo "[INFO] Detected QoS config from nicctl: MORI_RDMA_TC=$MORI_RDMA_TC, MORI_RDMA_SL=$MORI_RDMA_SL, MORI_IO_TC=$MORI_IO_TC, MORI_IO_SL=$MORI_IO_SL"
else
echo "[WARN] nicctl available but QoS data unavailable; trying hostname detection."
# Fall back to hostname-based detection
NODENAME=$(hostname -s)
if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
export MORI_RDMA_TC=96
export MORI_IO_TC=96
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
export MORI_IO_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
else
echo "[INFO] Unable to detect MORI_RDMA_TC from hostname. Skipping RDMA QoS configuration."
Expand All @@ -110,9 +122,11 @@ else
NODENAME=$(hostname -s)
if [[ $NODENAME == GPU* ]] || [[ $NODENAME == smci355-ccs-aus* ]]; then
export MORI_RDMA_TC=96
export MORI_IO_TC=96
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
elif [[ $NODENAME == mia1* ]]; then
export MORI_RDMA_TC=104
export MORI_IO_TC=104
echo "[INFO] Auto-detected MORI_RDMA_TC=$MORI_RDMA_TC from hostname $NODENAME"
else
echo "[INFO] nicctl not found and unable to detect from hostname. Skipping RDMA QoS configuration."
Expand All @@ -124,3 +138,4 @@ fi
export PYTHONPATH=/sgl-workspace/aiter:${PYTHONPATH}


set +x
34 changes: 34 additions & 0 deletions benchmarks/multi_node/amd_utils/models.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -222,3 +222,37 @@ DeepSeek-R1-0528-MXFP4:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"

DeepSeek-R1-0528-MXFP4-v2:
base_flags: "--decode-log-interval 1000 --log-level warning --watchdog-timeout 3600 --ep-dispatch-algorithm fake --load-balance-method round_robin --kv-cache-dtype fp8_e4m3 --attention-backend aiter --disaggregation-transfer-backend mori"
mtp_flags: "--speculative-draft-model-path SGLang/DeepSeek-R1-NextN --speculative-algorithm NEXTN --speculative-eagle-topk 1 --speculative-attention-mode decode "
dp_flags: "--moe-a2a-backend mori --deepep-mode normal --enable-dp-attention --moe-dense-tp-size 1 --enable-dp-lm-head --stream-interval 100 --tokenizer-worker-num 32 "
prefill:
mem_fraction_static: 0.8
disable_radix_cache: true
dp:
max_running_requests: 4096
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_PREFILL * PREFILL_TP_SIZE"
cuda_graph_bs: "1 2 3"
context_length: 9217
max_total_tokens: 131072
enable_two_batch_overlap: true
no_dp:
max_running_requests: 128
chunked_prefill_size: 16384
cuda_graph_bs_range: "1-128"
decode:
mem_fraction_static: 0.85
prefill_round_robin_balance: true
dp:
max_running_requests: 4096
chunked_prefill_size: "MORI_MAX_DISPATCH_TOKENS_DECODE * DECODE_TP_SIZE"
cuda_graph_bs_range: "1-512"
ep_only:
max_running_requests: 256
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-256"
no_dp:
max_running_requests: 128
chunked_prefill_size: 262144
cuda_graph_bs_range: "1-128"
56 changes: 33 additions & 23 deletions benchmarks/multi_node/amd_utils/server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,9 @@ no_dp = prefill.get('no_dp', {})
print(f'PREFILL_MAX_RUNNING_REQUESTS_DP=\"{dp.get(\"max_running_requests\", 24)}\"')
print(f'PREFILL_CHUNKED_PREFILL_SIZE_DP=\"{eval_formula(dp.get(\"chunked_prefill_size\", 262144))}\"')
print(f'PREFILL_CUDA_GRAPH_BS_DP=\"{dp.get(\"cuda_graph_bs\", \"1 2 3\")}\"')
print(f'PREFILL_CONTEXT_LENGTH_DP=\"{dp.get(\"context_length\", \"\")}\"')
print(f'PREFILL_MAX_TOTAL_TOKENS_DP=\"{dp.get(\"max_total_tokens\", \"\")}\"')
print(f'PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP=\"{dp.get(\"enable_two_batch_overlap\", False)}\"')
print(f'PREFILL_MAX_RUNNING_REQUESTS_NO_DP=\"{no_dp.get(\"max_running_requests\", 128)}\"')
print(f'PREFILL_CHUNKED_PREFILL_SIZE_NO_DP=\"{eval_formula(no_dp.get(\"chunked_prefill_size\", 262144))}\"')
s, e = parse_range(no_dp.get('cuda_graph_bs_range', '1-128'), 1, 128)
Expand Down Expand Up @@ -169,10 +172,16 @@ if [[ "$PREFILL_ENABLE_DP" == "true" ]]; then
prefill_cuda_graph_bs=($PREFILL_CUDA_GRAPH_BS_DP)
prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_DP
prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_DP
prefill_context_length=$PREFILL_CONTEXT_LENGTH_DP
prefill_max_total_tokens=$PREFILL_MAX_TOTAL_TOKENS_DP
prefill_enable_two_batch_overlap=$PREFILL_ENABLE_TWO_BATCH_OVERLAP_DP
else
prefill_cuda_graph_bs=($(seq $PREFILL_CUDA_GRAPH_BS_NO_DP_START $PREFILL_CUDA_GRAPH_BS_NO_DP_END))
prefill_max_running_requests=$PREFILL_MAX_RUNNING_REQUESTS_NO_DP
prefill_chunked_prefill_size=$PREFILL_CHUNKED_PREFILL_SIZE_NO_DP
prefill_context_length=""
prefill_max_total_tokens=""
prefill_enable_two_batch_overlap="false"
fi

# Compute DP-dependent decode parameters (3-way: DP > EP-only > no_dp)
Expand All @@ -187,29 +196,31 @@ else
decode_max_running_requests=$DECODE_MAX_RUNNING_REQUESTS_NO_DP
fi

# Use Decode configuration to configure different TP/DP size between P and D
PREFILL_DECODE_DIFFERENT_TP=""
if [[ "$PREFILL_ENABLE_DP" != "$DECODE_ENABLE_DP" ]]; then
if [[ "$DECODE_ENABLE_DP" == "true" ]]; then
PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp ${DECODE_TP_SIZE}"
else
PREFILL_DECODE_DIFFERENT_TP="--disaggregation-decode-tp ${DECODE_TP_SIZE} --disaggregation-decode-dp 1"
fi
fi

# Build the composed config strings (equivalent to the old MODEL_PREFILL_CONFIGS / MODEL_DECODE_CONFIGS)
PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} ${PREFILL_DECODE_DIFFERENT_TP}"
PREFILL_MODE_FLAGS="--mem-fraction-static ${PREFILL_MEM_FRACTION_STATIC} --max-running-requests ${prefill_max_running_requests} --chunked-prefill-size ${prefill_chunked_prefill_size} --cuda-graph-bs ${prefill_cuda_graph_bs[*]} "
if [[ "$PREFILL_DISABLE_RADIX_CACHE" == "True" ]] || [[ "$PREFILL_DISABLE_RADIX_CACHE" == "true" ]]; then
PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --disable-radix-cache"
fi
if [[ -n "$prefill_context_length" ]]; then
PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --context-length ${prefill_context_length}"
fi
if [[ -n "$prefill_max_total_tokens" ]]; then
PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --max-total-tokens ${prefill_max_total_tokens}"
fi
if [[ "$prefill_enable_two_batch_overlap" == "True" ]] || [[ "$prefill_enable_two_batch_overlap" == "true" ]]; then
PREFILL_MODE_FLAGS="$PREFILL_MODE_FLAGS --enable-two-batch-overlap"
PREFILL_SDMA_ENV="MORI_ENABLE_SDMA=true"
fi

DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]} "

DECODE_MODE_FLAGS="--mem-fraction-static ${DECODE_MEM_FRACTION_STATIC} --max-running-requests ${decode_max_running_requests} --cuda-graph-bs ${decode_cuda_graph_bs[*]}"
if [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "True" ]] || [[ "$DECODE_PREFILL_ROUND_ROBIN_BALANCE" == "true" ]]; then
DECODE_MODE_FLAGS="$DECODE_MODE_FLAGS --prefill-round-robin-balance"
fi

if [[ "$DECODE_MTP_SIZE" -gt 0 ]]; then
MORI_MAX_DISPATCH_TOKENS_DECODE=$((MORI_MAX_DISPATCH_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
MORI_MOE_MAX_INPUT_TOKENS_DECODE=$((MORI_MOE_MAX_INPUT_TOKENS_DECODE * (DECODE_MTP_SIZE + 1)))
fi

# =============================================================================
Expand Down Expand Up @@ -362,20 +373,21 @@ if [ "$NODE_RANK" -eq 0 ]; then
echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}, MTP size=${DECODE_MTP_SIZE}"
echo "Prefill servers ($((PREFILL_TP_SIZE/GPUS_PER_NODE)) nodes): ${PREFILL_ARGS}"
echo "Decode servers ($((DECODE_TP_SIZE/GPUS_PER_NODE)) nodes): ${DECODE_ARGS}"
echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK: ${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE}"
echo "Prefill env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL}"
echo "Decode env: SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} "
echo "Decode env: SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} "

echo "================================================"

# start the head prefill server
PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
--model-path $MODEL_DIR/$MODEL_NAME \
--disaggregation-mode prefill \
--disaggregation-ib-device ${IBDEVICES} \
--host 0.0.0.0 \
--port 8000 \
--trust-remote-code \
${PREFILL_SERVER_CONFIG} \
--log-level-http warning"
${PREFILL_SERVER_CONFIG} "

if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
PREFILL_CMD="$PREFILL_CMD --dist-init-addr ${PREFILL_HEADNODE_URLS[0]} --nnodes ${PREFILL_NODES_PER_WORKER} --node-rank 0"
Expand Down Expand Up @@ -589,15 +601,14 @@ elif [ "$NODE_RANK" -gt 0 ] && [ "$NODE_RANK" -lt "$NODE_OFFSET" ]; then
echo "Using prefill config: $PREFILL_SERVER_CONFIG"
echo "Prefill parallelism: TP=${PREFILL_TP_SIZE}, EP enabled: ${PREFILL_ENABLE_EP}, DP enabled: ${PREFILL_ENABLE_DP}"

PREFILL_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
PREFILL_CMD="${PREFILL_SDMA_ENV} SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_PREFILL} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_PREFILL} python3 -m sglang.launch_server \
--model-path $MODEL_DIR/${MODEL_NAME} \
--disaggregation-mode prefill \
--disaggregation-ib-device ${IBDEVICES} \
--host 0.0.0.0 \
--port 8000 \
--trust-remote-code \
${PREFILL_SERVER_CONFIG} \
--log-level-http warning"
${PREFILL_SERVER_CONFIG} "

if [ "$PREFILL_NODES_PER_WORKER" -gt 1 ]; then
rank=$((NODE_RANK % PREFILL_NODES_PER_WORKER))
Expand Down Expand Up @@ -652,15 +663,14 @@ else
echo "Decode node rank: $RANK"
echo "Decode parallelism: TP=${DECODE_TP_SIZE}, EP enabled: ${DECODE_ENABLE_EP}, DP enabled: ${DECODE_ENABLE_DP}"

DECODE_CMD="SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
DECODE_CMD="SGLANG_MORI_MOE_MAX_INPUT_TOKENS=${MORI_MOE_MAX_INPUT_TOKENS_DECODE} SGLANG_MORI_NUM_MAX_DISPATCH_TOKENS_PER_RANK=${MORI_MAX_DISPATCH_TOKENS_DECODE} python3 -m sglang.launch_server \
--model-path ${MODEL_DIR}/${MODEL_NAME} \
--disaggregation-mode decode \
--disaggregation-ib-device ${IBDEVICES} \
--host 0.0.0.0 \
--port 8000 \
--trust-remote-code \
${DECODE_SERVER_CONFIG} \
--log-level-http warning"
${DECODE_SERVER_CONFIG} "

if [ "$DECODE_NODES_PER_WORKER" -gt 1 ]; then
rank=$((RANK % DECODE_NODES_PER_WORKER))
Expand Down
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2087,3 +2087,14 @@
- "Dynamic scheduler-recv-interval: 30 for CONC>4, 10 otherwise"
- "Remove --max-running-requests, reduce prefill/chunked from 81920 to 16384"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1018

- config-keys:
- dsr1-fp4-mi355x-sglang-disagg
- dsr1-fp4-mi355x-sglang-disagg-mtp
description:
- "Bump SGL mori image to lmsysorg/sglang-rocm"
- "Add more high tput / low latency sweep configs"
- "Enable v2 mxfp4 DSR1 0528 model"
- "Enable fp4 disp / fp8 combine feature on mori"
- "Enable Mori SDMA + two batch overlapping feature"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1236
Loading