diff --git a/.github/workflows/dsr1-tmpl.yml b/.github/workflows/dsr1-tmpl.yml
index 3a48710f2..9852a4b84 100644
--- a/.github/workflows/dsr1-tmpl.yml
+++ b/.github/workflows/dsr1-tmpl.yml
@@ -114,9 +114,9 @@ jobs:
     secrets: inherit
     with:
       runner: mi300x
-      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915'
+      image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
       model: 'deepseek-ai/DeepSeek-R1-0528'
-      framework: 'sglang'
+      framework: 'vllm'
       precision: 'fp8'
       exp-name: ${{ inputs.exp-name }}
       isl: ${{ inputs.isl }}
@@ -131,9 +131,9 @@ jobs:
     secrets: inherit
     with:
       runner: mi325x
-      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915'
+      image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
       model: 'deepseek-ai/DeepSeek-R1-0528'
-      framework: 'sglang'
+      framework: 'vllm'
       precision: 'fp8'
       exp-name: ${{ inputs.exp-name }}
       isl: ${{ inputs.isl }}
@@ -148,9 +148,9 @@ jobs:
     secrets: inherit
     with:
       runner: mi355x
-      image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915'
+      image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
       model: 'deepseek-ai/DeepSeek-R1-0528'
-      framework: 'sglang'
+      framework: 'vllm'
       precision: 'fp8'
       exp-name: ${{ inputs.exp-name }}
       isl: ${{ inputs.isl }}
diff --git a/.github/workflows/runner-model-sweep-test.yml b/.github/workflows/runner-model-sweep-test.yml
index 212ffc07c..fd482f035 100644
--- a/.github/workflows/runner-model-sweep-test.yml
+++ b/.github/workflows/runner-model-sweep-test.yml
@@ -212,7 +212,7 @@ jobs:
           - 'mi300x-oci_0'
         config:
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
-          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
+          - { image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'vllm', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }

     name: '${{ matrix.runner }}'
@@ -245,7 +245,7 @@ jobs:
           - 'mi325x-tw_3'
         config:
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
-          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi30x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
+          - { image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'vllm', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }

     name: '${{ matrix.runner }}'
@@ -278,7 +278,7 @@ jobs:
         config:
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-FP8-KV', framework: 'vllm', precision: 'fp8', exp-name: '70b_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'amd/Llama-3.3-70B-Instruct-MXFP4-Preview', framework: 'vllm', precision: 'fp4', exp-name: '70b_test' }
-          - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'sglang', precision: 'fp8', exp-name: 'dsr1_test' }
+          - { image: 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029', model: 'deepseek-ai/DeepSeek-R1-0528', framework: 'vllm', precision: 'fp8', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915', model: 'amd/DeepSeek-R1-0528-MXFP4-Preview', framework: 'sglang', precision: 'fp4', exp-name: 'dsr1_test' }
           - { image: 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1', model: 'openai/gpt-oss-120b', framework: 'vllm', precision: 'fp4', exp-name: 'gptoss_test' }
diff --git a/.github/workflows/runner-sweep-test.yml b/.github/workflows/runner-sweep-test.yml
index 6a1b4d4e8..c4ce8da2b 100644
--- a/.github/workflows/runner-sweep-test.yml
+++ b/.github/workflows/runner-sweep-test.yml
@@ -36,6 +36,7 @@ on:
           - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915'
          - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915'
           - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
+          - 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
           - 'vllm/vllm-openai:v0.10.2'

       model:
diff --git a/.github/workflows/runner-test.yml b/.github/workflows/runner-test.yml
index e0fcbaf3b..086dfa32c 100644
--- a/.github/workflows/runner-test.yml
+++ b/.github/workflows/runner-test.yml
@@ -67,6 +67,7 @@ on:
           - 'rocm/7.0:rocm7.0_ubuntu_22.04_sgl-dev-v0.5.2-rocm7.0-mi35x-20250915'
           - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250915'
           - 'rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1'
+          - 'rocm/7.x-preview:rocm7.2_preview_ubuntu_22.04_vlm_0.10.1_instinct_20251029'
           - 'vllm/vllm-openai:v0.10.2'
       model:
         description: 'Model'
diff --git a/benchmarks/dsr1_fp8_mi300x_docker.sh b/benchmarks/dsr1_fp8_mi300x_docker.sh
index fca44bcf1..4f70e38c3 100644
--- a/benchmarks/dsr1_fp8_mi300x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi300x_docker.sh
@@ -9,8 +9,6 @@
 # CONC
 # MAX_MODEL_LEN

-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark

 # If the machine runs a MEC FW older than 177, RCCL
 # cannot reclaim some memory.
@@ -22,15 +20,35 @@ if [[ "$version" == "" || $version -lt 177 ]]; then
     export HSA_NO_SCRATCH_RECLAIM=1
 fi

-export SGLANG_USE_AITER=1
+
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
+
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0

 set -x
-python3 -m sglang.launch_server \
---model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \
---tensor-parallel-size=$TP \
---mem-fraction-static=0.8 \
---cuda-graph-max-bs=128 \
---chunked-prefill-size=196608 \
---num-continuous-decode-steps=4 \
---max-prefill-tokens=196608 \
---disable-radix-cache
+vllm serve ${MODEL} \
+    --host localhost \
+    --port $PORT \
+    --swap-space 64 \
+    --tensor-parallel-size $TP \
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto
diff --git a/benchmarks/dsr1_fp8_mi300x_slurm.sh b/benchmarks/dsr1_fp8_mi300x_slurm.sh
index c1c4276c2..e26df00a6 100644
--- a/benchmarks/dsr1_fp8_mi300x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi300x_slurm.sh
@@ -20,8 +20,6 @@ huggingface-cli download $MODEL
 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=8888

-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark

 # If the machine runs a MEC FW older than 177, RCCL
 # cannot reclaim some memory.
@@ -33,24 +31,43 @@ if [[ "$version" == "" || $version -lt 177 ]]; then
     export HSA_NO_SCRATCH_RECLAIM=1
 fi

-export SGLANG_USE_AITER=1
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
+
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0

 set -x
-python3 -m sglang.launch_server \
---model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \
---tensor-parallel-size=$TP \
---mem-fraction-static=0.8 \
---cuda-graph-max-bs=128 \
---chunked-prefill-size=196608 \
---num-continuous-decode-steps=4 \
---max-prefill-tokens=196608 \
---disable-radix-cache \
+vllm serve ${MODEL} \
+    --host localhost \
+    --port $PORT \
+    --swap-space 64 \
+    --tensor-parallel-size $TP \
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto \
 > $SERVER_LOG 2>&1 &
 set +x

 while IFS= read -r line; do
     printf '%s\n' "$line"
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
+    if [[ "$line" =~ Application\ startup\ complete ]]; then
         break
     fi
 done < <(tail -F -n0 "$SERVER_LOG")
diff --git a/benchmarks/dsr1_fp8_mi325x_docker.sh b/benchmarks/dsr1_fp8_mi325x_docker.sh
index f39a8dbbd..ebb2eb4c8 100644
--- a/benchmarks/dsr1_fp8_mi325x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi325x_docker.sh
@@ -9,20 +9,33 @@
 # CONC
 # MAX_MODEL_LEN

-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput

-export SGLANG_USE_AITER=1
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4

-python3 -m sglang.launch_server \
-    --model-path $MODEL \
-    --host=0.0.0.0 \
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0
+
+vllm serve ${MODEL} \
+    --host localhost \
     --port $PORT \
+    --swap-space 64 \
     --tensor-parallel-size $TP \
-    --trust-remote-code \
-    --chunked-prefill-size 196608 \
-    --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
-    --max-prefill-tokens 196608 \
-    --cuda-graph-max-bs 128
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto
diff --git a/benchmarks/dsr1_fp8_mi325x_slurm.sh b/benchmarks/dsr1_fp8_mi325x_slurm.sh
index acbe78d08..a00a8cc3b 100644
--- a/benchmarks/dsr1_fp8_mi325x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi325x_slurm.sh
@@ -6,33 +6,43 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 PORT=8888
 huggingface-cli download $MODEL

-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-sglang-deepseek-r1-fp8.html#run-the-inference-benchmark
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput

-export SGLANG_USE_AITER=1
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=0

 set -x
-python3 -m sglang.launch_server \
---model-path=$MODEL --host=0.0.0.0 --port=$PORT --trust-remote-code \
---tensor-parallel-size=$TP \
---mem-fraction-static=0.8 \
---cuda-graph-max-bs=128 \
---chunked-prefill-size=196608 \
---num-continuous-decode-steps=4 \
---max-prefill-tokens=196608 \
---disable-radix-cache \
+vllm serve ${MODEL} \
+    --host localhost \
+    --port $PORT \
+    --swap-space 64 \
+    --tensor-parallel-size $TP \
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto \
 > $SERVER_LOG 2>&1 &
 set +x

 while IFS= read -r line; do
     printf '%s\n' "$line"
-    if [[ "$line" =~ [Ee][Rr][Rr][Oo][Rr] ]]; then
-        sleep 5
-        tail -n100 "$SERVER_LOG"
-        echo "JOB $SLURM_JOB_ID ran on $SLURMD_NODENAME"
-        exit 1
-    fi
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
+    if [[ "$line" =~ Application\ startup\ complete ]]; then
         break
     fi
 done < <(tail -F -n0 "$SERVER_LOG")
diff --git a/benchmarks/dsr1_fp8_mi355x_docker.sh b/benchmarks/dsr1_fp8_mi355x_docker.sh
index f39a8dbbd..d2448c786 100644
--- a/benchmarks/dsr1_fp8_mi355x_docker.sh
+++ b/benchmarks/dsr1_fp8_mi355x_docker.sh
@@ -8,21 +8,33 @@
 # TP
 # CONC
 # MAX_MODEL_LEN
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput

-# Reference
-# https://rocm.docs.amd.com/en/docs-7.0-docker/benchmark-docker/inference-sglang-deepseek-r1-fp8.html
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4

-export SGLANG_USE_AITER=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=1

-python3 -m sglang.launch_server \
-    --model-path $MODEL \
-    --host=0.0.0.0 \
+
+vllm serve ${MODEL} \
+    --host localhost \
     --port $PORT \
+    --swap-space 64 \
     --tensor-parallel-size $TP \
-    --trust-remote-code \
-    --chunked-prefill-size 196608 \
-    --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
-    --max-prefill-tokens 196608 \
-    --cuda-graph-max-bs 128
-
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto
\ No newline at end of file
diff --git a/benchmarks/dsr1_fp8_mi355x_slurm.sh b/benchmarks/dsr1_fp8_mi355x_slurm.sh
index bf5d60e9c..07e3b6f3d 100644
--- a/benchmarks/dsr1_fp8_mi355x_slurm.sh
+++ b/benchmarks/dsr1_fp8_mi355x_slurm.sh
@@ -14,28 +14,44 @@
 # RESULT_FILENAME

 export HF_MODULES_CACHE="/tmp/hf_modules_cache/"
-export SGLANG_USE_AITER=1

 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)

-set -x
-python3 -m sglang.launch_server \
-    --model-path $MODEL \
-    --host=0.0.0.0 \
+max_model_len=16384 # Must be >= the input + output length
+max_seq_len_to_capture=10240 # Beneficial to set this to max_model_len
+max_num_seqs=1024
+max_num_batched_tokens=131072 # Smaller values may result in better TTFT but worse TPOT / Throughput
+
+export VLLM_USE_V1=1
+export VLLM_USE_AITER_TRITON_ROPE=1
+export VLLM_ROCM_USE_AITER=1
+export VLLM_ROCM_USE_AITER_RMSNORM=1
+export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
+
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_RMSNORM_FP8_QUANT=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_MUL_ADD=1
+export VLLM_ROCM_USE_AITER_TRITON_FUSED_SHARED_EXPERTS=1
+
+
+vllm serve ${MODEL} \
+    --host localhost \
     --port $PORT \
+    --swap-space 64 \
     --tensor-parallel-size $TP \
-    --trust-remote-code \
-    --chunked-prefill-size 196608 \
-    --mem-fraction-static 0.8 \
-    --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
-    --max-prefill-tokens 196608 \
-    --cuda-graph-max-bs 128 > $SERVER_LOG 2>&1 &
+    --max-num-seqs ${max_num_seqs} \
+    --no-enable-prefix-caching \
+    --max-num-batched-tokens ${max_num_batched_tokens} \
+    --max-model-len ${max_model_len} \
+    --block-size 1 \
+    --gpu-memory-utilization 0.95 \
+    --max-seq-len-to-capture ${max_seq_len_to_capture} \
+    --async-scheduling \
+    --kv-cache-dtype auto > $SERVER_LOG 2>&1 &
 set +x

 while IFS= read -r line; do
     printf '%s\n' "$line"
-    if [[ "$line" == *"The server is fired up and ready to roll"* ]]; then
+    if [[ "$line" =~ Application\ startup\ complete ]]; then
         break
     fi
 done < <(tail -F -n0 "$SERVER_LOG")
diff --git a/runners/launch_mi300x-amd.sh b/runners/launch_mi300x-amd.sh
index 51e059d4c..e4fa0804f 100644
--- a/runners/launch_mi300x-amd.sh
+++ b/runners/launch_mi300x-amd.sh
@@ -31,6 +31,16 @@ while IFS= read -r line; do
     fi
 done < <(docker logs -f --tail=0 $server_name 2>&1)
+if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then
+    if [[ "$OSL" == "8192" ]]; then
+        NUM_PROMPTS=$(( CONC * 20 ))
+    else
+        NUM_PROMPTS=$(( CONC * 50 ))
+    fi
+else
+    NUM_PROMPTS=$(( CONC * 10 ))
+fi
+

 git clone https://github.com/kimbochen/bench_serving.git

 set -x
@@ -43,7 +53,7 @@ bench_serving/benchmark_serving.py \
 --model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \
 --dataset-name=random \
 --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$(( $CONC * 10 )) \
+--num-prompts=$NUM_PROMPTS \
 --max-concurrency=$CONC \
 --request-rate=inf --ignore-eos \
 --save-result --percentile-metrics="ttft,tpot,itl,e2el" \
diff --git a/runners/launch_mi325x-amd.sh b/runners/launch_mi325x-amd.sh
index 91b9bfad3..c521c809f 100644
--- a/runners/launch_mi325x-amd.sh
+++ b/runners/launch_mi325x-amd.sh
@@ -31,6 +31,16 @@ while IFS= read -r line; do
     fi
 done < <(docker logs -f --tail=0 $server_name 2>&1)
+if [[ "$MODEL" == "amd/DeepSeek-R1-0528-MXFP4-Preview" || "$MODEL" == "deepseek-ai/DeepSeek-R1-0528" ]]; then
+    if [[ "$OSL" == "8192" ]]; then
+        NUM_PROMPTS=$(( CONC * 20 ))
+    else
+        NUM_PROMPTS=$(( CONC * 50 ))
+    fi
+else
+    NUM_PROMPTS=$(( CONC * 10 ))
+fi
+

 git clone https://github.com/kimbochen/bench_serving.git

 set -x
@@ -43,7 +53,7 @@ bench_serving/benchmark_serving.py \
 --model=$MODEL --backend=vllm --base-url=http://$server_name:$PORT \
 --dataset-name=random \
 --random-input-len=$ISL --random-output-len=$OSL --random-range-ratio=$RANDOM_RANGE_RATIO \
---num-prompts=$(( $CONC * 10 )) \
+--num-prompts=$NUM_PROMPTS \
 --max-concurrency=$CONC \
 --request-rate=inf --ignore-eos \
 --save-result --percentile-metrics="ttft,tpot,itl,e2el" \
diff --git a/runners/launch_mi355x-amd.sh b/runners/launch_mi355x-amd.sh
index 87ee8cbd2..753ffe337 100644
--- a/runners/launch_mi355x-amd.sh
+++ b/runners/launch_mi355x-amd.sh
@@ -13,6 +13,7 @@
 # GITHUB_WORKSPACE
 # RESULT_FILENAME
 # HF_TOKEN
+sudo sh -c 'echo 0 > /proc/sys/kernel/numa_balancing'

 HF_HUB_CACHE_MOUNT="/nfsdata/hf_hub_cache-1/" # Temp solution
 PORT=8888