12 changes: 6 additions & 6 deletions .github/workflows/dsr1-tmpl.yml
@@ -114,9 +114,9 @@ jobs:
     secrets: inherit
     with:
       runner: mi300x
-      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915'
+      image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
       model: 'deepseek-ai/DeepSeek-R1-0528'
-      framework: 'sglang'
+      framework: 'vllm'
       precision: 'fp8'
       exp-name: ${{ inputs.exp-name }}
       isl: ${{ inputs.isl }}
@@ -131,9 +131,9 @@ jobs:
     secrets: inherit
     with:
       runner: mi325x
-      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915'
+      image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
       model: 'deepseek-ai/DeepSeek-R1-0528'
-      framework: 'sglang'
+      framework: 'vllm'
       precision: 'fp8'
       exp-name: ${{ inputs.exp-name }}
       isl: ${{ inputs.isl }}
@@ -148,9 +148,9 @@ jobs:
     secrets: inherit
     with:
       runner: mi355x
-      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915'
+      image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
       model: 'deepseek-ai/DeepSeek-R1-0528'
-      framework: 'sglang'
+      framework: 'vllm'
       precision: 'fp8'
       exp-name: ${{ inputs.exp-name }}
       isl: ${{ inputs.isl }}
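
All three runner variants in this template (MI300X, MI325X, MI355X) now converge on the same vLLM preview image, replacing the per-family sgl-dev builds. Pre-fetching it on a runner is a plain docker pull:

docker pull rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029
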
6 changes: 3 additions & 3 deletions .github/workflows/runner-model-sweep-test.yml
@@ -212,7 +212,7 @@ jobs:
           - 'mi300x-oci_0'
         config:
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
-          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
+          - { image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'vllm', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
 
     name: '${{ matrix.runner }}'
@@ -245,7 +245,7 @@ jobs:
           - 'mi325x-tw_3'
         config:
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
-          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
+          - { image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'vllm', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
 
     name: '${{ matrix.runner }}'
@@ -278,7 +278,7 @@ jobs:
         config:
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' }
-          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
+          - { image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'vllm', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
 
1 change: 1 addition & 0 deletions .github/workflows/runner-sweep-test.yml
@@ -36,6 +36,7 @@ on:
         - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915'
         - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915'
         - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
+        - 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
         - 'vllm/vllm-openai:v0.10.2'
 
       model:
1 change: 1 addition & 0 deletions .github/workflows/runner-test.yml
@@ -67,6 +67,7 @@ on:
         - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915'
         - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915'
         - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
+        - 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
         - 'vllm/vllm-openai:v0.10.2'
       model:
         description: 'Model'
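
With the new tag added to the image choice lists above, a run against it can be dispatched from the CLI. A sketch, assuming these are workflow_dispatch-triggered workflows and using only the input names visible in this diff (image, model); any other required inputs would have to be supplied as well:

# Hypothetical manual dispatch of runner-test.yml with the preview image.
gh workflow run runner-test.yml \
    -f image='rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029' \
    -f model='deepseek-ai/DeepSeek-R1-0528'
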
42 changes: 30 additions & 12 deletions benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -9,8 +9,6 @@
 # CONC
 # MAX_MODEL_LEN
 
-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark
 
 # If the machine runs a MEC FW older than 177, RCCL
 # cannot reclaim some memory.
@@ -22,15 +20,35 @@ if [[ "$version" == "" || $version -lt 177 ]]; then
   export HSA_NO_SCRATCH_RECLAIM=1
 fi
 
-export SGLANG_USE_AITER=1
 
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
+
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0
+
 set -x
-python3 -m sglang.launch_server \
-    --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \
-    --tensor-parallel-size=$TP \
-    --mem-fraction-static=0.8 \
-    --cuda-graph-max-bs=128 \
-    --chunked-prefill-size=196608 \
-    --num-continuous-decode-steps=4 \
-    --max-prefill-tokens=196608 \
-    --disable-radix-cache
+vllm serve ${MODEL} \
+    --host localhost \
+    --port $PORT \
+    --swap-space 64 \
+    --tensor-parallel-size $TP \
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto
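
Note that the MEC firmware guard kept above still consumes a $version value assigned in an unchanged, collapsed part of the script. One plausible shape for that assignment — an illustration only, since the real query is not shown in this diff — is to parse the MEC firmware version out of rocm-smi's firmware listing:

# Hypothetical reconstruction of the elided $version assignment:
# take the first MEC firmware entry reported by rocm-smi and keep its number.
version=$(rocm-smi --showfwinfo 2>/dev/null | grep -i 'mec' | head -n1 | grep -oE '[0-9]+' | tail -n1)
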
43 changes: 30 additions & 13 deletions benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -20,8 +20,6 @@ huggingface-cli download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=8888
 
-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark
 
 # If the machine runs a MEC FW older than 177, RCCL
 # cannot reclaim some memory.
@@ -33,24 +31,43 @@ if [[ "$version" == "" || $version -lt 177 ]]; then
   export HSA_NO_SCRATCH_RECLAIM=1
 fi
 
-export SGLANG_USE_AITER=1
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
+
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0
 
 set -x
-python3 -m sglang.launch_server \
-    --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \
-    --tensor-parallel-size=$TP \
-    --mem-fraction-static=0.8 \
-    --cuda-graph-max-bs=128 \
-    --chunked-prefill-size=196608 \
-    --num-continuous-decode-steps=4 \
-    --max-prefill-tokens=196608 \
-    --disable-radix-cache \
+vllm serve ${MODEL} \
+    --host localhost \
+    --port $PORT \
+    --swap-space 64 \
+    --tensor-parallel-size $TP \
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto \
     > $SERVER_LOG 2>&1 &
 
 set +x
 while IFS= read -r line; do
     printf '%s\n' "$line"
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
+    if [[ "$line" =~ Application\ startup\ complete ]]; then
         break
     fi
 done < <(tail -F -n0 "$SERVER_LOG")
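
The rewritten wait loop keys on vLLM's "Application startup complete" log line (uvicorn's startup message) instead of SGLang's "fired up and ready to roll" banner. A log-format-independent alternative — a sketch relying on the /health endpoint that vLLM's OpenAI-compatible server exposes — would poll over HTTP:

# Alternative readiness probe: poll /health instead of scraping the log.
# The 1-second interval is arbitrary.
until curl -sf "http://localhost:${PORT}/health" > /dev/null; do
    sleep 1
done
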
37 changes: 25 additions & 12 deletions benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -9,20 +9,33 @@
 # CONC
 # MAX_MODEL_LEN
 
-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
 
-export SGLANG_USE_AITER=1
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
-python3 -m sglang.launch_server \
-    --model-path $MODEL \
-    --host=0.0.0.0 \
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0
+
+vllm serve ${MODEL} \
+    --host localhost \
     --port $PORT \
+    --swap-space 64 \
     --tensor-parallel-size $TP \
-    --trust-remote-code \
-    --chunked-prefill-size 196608 \
-    --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
-    --max-prefill-tokens 196608 \
-    --cuda-graph-max-bs 128
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto
 
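
These scripts only bring the server up; the CONC and sequence-length knobs named in the headers are consumed by a separate load generator. As a sketch of how one sweep point might be driven — assuming vLLM's bundled benchmark CLI (vllm bench serve) with its random dataset, and treating ISL/OSL/CONC as placeholders for however the harness actually passes them in:

# Hypothetical client-side benchmark against the server started above.
vllm bench serve \
    --model ${MODEL} \
    --port $PORT \
    --dataset-name random \
    --random-input-len $ISL \
    --random-output-len $OSL \
    --max-concurrency $CONC \
    --num-prompts $((CONC * 10))
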
48 changes: 29 additions & 19 deletions benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -6,33 +6,43 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=8888
 huggingface-cli download $MODEL
 
-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
 
-export SGLANG_USE_AITER=1
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0
 
 set -x
-python3 -m sglang.launch_server \
-    --model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \
-    --tensor-parallel-size=$TP \
-    --mem-fraction-static=0.8 \
-    --cuda-graph-max-bs=128 \
-    --chunked-prefill-size=196608 \
-    --num-continuous-decode-steps=4 \
-    --max-prefill-tokens=196608 \
-    --disable-radix-cache \
+vllm serve ${MODEL} \
+    --host localhost \
+    --port $PORT \
+    --swap-space 64 \
+    --tensor-parallel-size $TP \
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto \
     > $SERVER_LOG 2>&1 &
 
 set +x
 while IFS= read -r line; do
     printf '%s\n' "$line"
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
-        sleep 5
-        tail -n100 "$SERVER_LOG"
-        echo "JOB $SLURM_JOB_ID ran on $SLURMD_NODENAME"
-        exit 1
-    fi
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
+    if [[ "$line" =~ Application\ startup\ complete ]]; then
        break
     fi
 done < <(tail -F -n0 "$SERVER_LOG")
38 changes: 25 additions & 13 deletions benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -8,21 +8,33 @@
 # TP
 # CONC
 # MAX_MODEL_LEN
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
 
-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
-export SGLANG_USE_AITER=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=1
 
-python3 -m sglang.launch_server \
-    --model-path $MODEL \
-    --host=0.0.0.0 \
+
+vllm serve ${MODEL} \
+    --host localhost \
     --port $PORT \
+    --swap-space 64 \
     --tensor-parallel-size $TP \
-    --trust-remote-code \
-    --chunked-prefill-size 196608 \
-    --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
-    --max-prefill-tokens 196608 \
-    --cuda-graph-max-bs 128
-
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto
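
One functional difference across the three Docker scripts is easy to miss: VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS is 0 for MI300X/MI325X but 1 here for MI355X. If these scripts were ever merged, the toggle could key off the reported GPU architecture — a sketch, assuming rocminfo is on PATH and the usual gfx942 (MI300X/MI325X) vs. gfx950 (MI355X) naming:

# Hypothetical arch-keyed default for the one env var that differs.
gfx=$(rocminfo | grep -om1 'gfx9[0-9a-f]*')
if [[ "$gfx" == "gfx950" ]]; then
    export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=1   # MI355X
else
    export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0   # MI300X / MI325X
fi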