8 changes: 4 additions & 4 deletions .github/configs/amd-master.yaml
@@ -132,7 +132,7 @@ qwen3.5-bf16-mi355x-sglang:
- { tp: 8, conc-start: 4, conc-end: 64 }

qwen3.5-bf16-mi300x-sglang:
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
model: Qwen/Qwen3.5-397B-A17B
model-prefix: qwen3.5
runner: mi300x
@@ -150,7 +150,7 @@ qwen3.5-bf16-mi300x-sglang:
- { tp: 8, conc-start: 4, conc-end: 64 }

qwen3.5-bf16-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
model: Qwen/Qwen3.5-397B-A17B
model-prefix: qwen3.5
runner: mi325x
@@ -168,7 +168,7 @@ qwen3.5-bf16-mi325x-sglang:
- { tp: 8, conc-start: 4, conc-end: 64 }

qwen3.5-fp8-mi325x-sglang:
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: mi325x
@@ -224,7 +224,7 @@ qwen3.5-fp4-mi355x-sglang:
- { tp: 4, conc-start: 4, conc-end: 32 }

qwen3.5-fp8-mi300x-sglang:
-  image: lmsysorg/sglang:v0.5.9-rocm720-mi30x
+  image: lmsysorg/sglang:v0.5.10-rocm720-mi30x
model: Qwen/Qwen3.5-397B-A17B-FP8
model-prefix: qwen3.5
runner: mi300x
15 changes: 12 additions & 3 deletions benchmarks/single_node/qwen3.5_bf16_mi300x.sh
@@ -19,25 +19,34 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
+CONTEXT_LENGTH=$((ISL + OSL + 20))
+MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

+# following Andy Luo's LinkedIn recipe: https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/
python3 -m sglang.launch_server \
-    --attention-backend triton \
+    --attention-backend aiter \
--model-path $MODEL \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
--data-parallel-size 1 \
--trust-remote-code \
-    --mem-fraction-static 0.8 \
-    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --tokenizer-worker-num 6 \
+    --enable-aiter-allreduce-fusion \
+    --cuda-graph-max-bs $CONC \
+    --disable-radix-cache \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --scheduler-recv-interval 30 \
+    --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

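Note on the new context-length arithmetic above: instead of a fixed `--context-length`, the scripts now size the context to the benchmark workload. A minimal sketch of how it plays out (the ISL/OSL values below are illustrative, not from this PR):

```bash
# Illustrative values; the benchmark harness normally exports ISL and OSL.
ISL=1024                             # input sequence length
OSL=1024                             # output sequence length
CONTEXT_LENGTH=$((ISL + OSL + 20))   # 20 extra tokens of headroom
echo "$CONTEXT_LENGTH"               # prints 2068
```

The 20-token cushion presumably covers chat-template and special tokens; the changelog entry itself only states the goal of better performance.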
15 changes: 12 additions & 3 deletions benchmarks/single_node/qwen3.5_bf16_mi325x.sh
@@ -19,25 +19,34 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
+CONTEXT_LENGTH=$((ISL + OSL + 20))
+MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

+# following Andy Luo's LinkedIn recipe: https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/
python3 -m sglang.launch_server \
-    --attention-backend triton \
+    --attention-backend aiter \
--model-path $MODEL \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
--data-parallel-size 1 \
--trust-remote-code \
-    --mem-fraction-static 0.8 \
-    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --tokenizer-worker-num 6 \
+    --enable-aiter-allreduce-fusion \
+    --cuda-graph-max-bs $CONC \
+    --disable-radix-cache \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --scheduler-recv-interval 30 \
+    --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

16 changes: 14 additions & 2 deletions benchmarks/single_node/qwen3.5_bf16_mi355x.sh
@@ -9,7 +9,8 @@ check_env_vars \
ISL \
OSL \
RANDOM_RANGE_RATIO \
-  RESULT_FILENAME
+  RESULT_FILENAME \
+  EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -19,11 +20,14 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
+CONTEXT_LENGTH=$((ISL + OSL + 20))
+MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
@@ -34,8 +38,16 @@ python3 -m sglang.launch_server \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
+    --ep-size $EP_SIZE \
--data-parallel-size 1 \
--trust-remote-code \
-    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --tokenizer-worker-num 6 \
+    --enable-aiter-allreduce-fusion \
+    --cuda-graph-max-bs $CONC \
+    --disable-radix-cache \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --scheduler-recv-interval 30 \
+    --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

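With `EP_SIZE` added to the required environment variables on MI355X, a hypothetical invocation of the updated script would look like this (every value below is illustrative, not taken from this PR; `MODEL`, `TP`, and `CONC` are assumed to be checked earlier in the script):

```bash
# Hypothetical invocation; all values are illustrative.
MODEL=Qwen/Qwen3.5-397B-A17B TP=8 EP_SIZE=8 CONC=64 \
ISL=1024 OSL=1024 RANDOM_RANGE_RATIO=0.5 \
RESULT_FILENAME=qwen3.5_bf16_mi355x_1k1k.json \
bash benchmarks/single_node/qwen3.5_bf16_mi355x.sh
```

`--ep-size` shards the MoE expert layers across that many ranks; it is commonly set equal to `TP`, but the value here is an assumption, not something this PR specifies.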
15 changes: 12 additions & 3 deletions benchmarks/single_node/qwen3.5_fp8_mi300x.sh
@@ -19,26 +19,35 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
+CONTEXT_LENGTH=$((ISL + OSL + 20))
+MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# following the LinkedIn recipe from AMD's Andy Luo
# https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/
python3 -m sglang.launch_server \
-    --attention-backend triton \
+    --attention-backend aiter \
--model-path $MODEL \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
--data-parallel-size 1 \
--trust-remote-code \
-    --mem-fraction-static 0.8 \
-    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --tokenizer-worker-num 6 \
+    --enable-aiter-allreduce-fusion \
+    --cuda-graph-max-bs $CONC \
+    --disable-radix-cache \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --scheduler-recv-interval 30 \
+    --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

15 changes: 12 additions & 3 deletions benchmarks/single_node/qwen3.5_fp8_mi325x.sh
@@ -19,26 +19,35 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
+CONTEXT_LENGTH=$((ISL + OSL + 20))
+MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor

# following the LinkedIn recipe from AMD's Andy Luo
# https://www.linkedin.com/feed/update/urn:li:activity:7429203734389280768/
python3 -m sglang.launch_server \
-    --attention-backend triton \
+    --attention-backend aiter \
--model-path $MODEL \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
--data-parallel-size 1 \
--trust-remote-code \
-    --mem-fraction-static 0.8 \
-    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --tokenizer-worker-num 6 \
+    --enable-aiter-allreduce-fusion \
+    --cuda-graph-max-bs $CONC \
+    --disable-radix-cache \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --scheduler-recv-interval 30 \
+    --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

16 changes: 14 additions & 2 deletions benchmarks/single_node/qwen3.5_fp8_mi355x.sh
@@ -9,7 +9,8 @@ check_env_vars \
ISL \
OSL \
RANDOM_RANGE_RATIO \
-  RESULT_FILENAME
+  RESULT_FILENAME \
+  EP_SIZE

if [[ -n "$SLURM_JOB_ID" ]]; then
echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -19,11 +20,14 @@ hf download "$MODEL"

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}
+CONTEXT_LENGTH=$((ISL + OSL + 20))
+MAX_PREFILL_TOKENS=32768

EVAL_CONTEXT_ARGS=""
if [ "${EVAL_ONLY}" = "true" ]; then
setup_eval_context
EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH"
fi
# Start GPU monitoring (power, temperature, clocks every second)
start_gpu_monitor
@@ -34,8 +38,16 @@ python3 -m sglang.launch_server \
--host=0.0.0.0 \
--port $PORT \
--tensor-parallel-size $TP \
+    --ep-size $EP_SIZE \
--data-parallel-size 1 \
--trust-remote-code \
-    --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --tokenizer-worker-num 6 \
+    --enable-aiter-allreduce-fusion \
+    --cuda-graph-max-bs $CONC \
+    --disable-radix-cache \
+    --max-prefill-tokens $MAX_PREFILL_TOKENS \
+    --scheduler-recv-interval 30 \
+    --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

SERVER_PID=$!

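For a quick sanity check once any of these servers is up, the probes below exercise SGLang's HTTP endpoints (a minimal sketch; the prompt and token count are illustrative):

```bash
PORT=${PORT:-8888}

# Liveness probe; returns HTTP 200 once the server is ready.
curl -s "http://localhost:${PORT}/health"

# One short generation via SGLang's native /generate endpoint.
curl -s "http://localhost:${PORT}/generate" \
  -H "Content-Type: application/json" \
  -d '{"text": "The capital of France is", "sampling_params": {"max_new_tokens": 16}}'
```

The benchmark harness drives the server itself; this is only for manually verifying a launch before a full run.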
10 changes: 10 additions & 0 deletions perf-changelog.yaml
@@ -1308,6 +1308,16 @@
- "Configs: 1k1k (TP4 conc 4-128), 8k1k (TP4 conc 4-128)"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/820

+- config-keys:
+    - qwen3.5-bf16-mi300x-sglang
+    - qwen3.5-bf16-mi325x-sglang
+    - qwen3.5-fp8-mi300x-sglang
+    - qwen3.5-fp8-mi325x-sglang
+  description:
+    - "Update CLI args of the Qwen3.5 FP8 and BF16 SGLang benchmarks on MI300X and MI325X to achieve better performance"
+    - "Use lmsysorg/sglang:v0.5.10-rocm720-mi30x"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/986

- config-keys:
- glm5-fp4-b200-sglang
description:
Expand Down