From 76d50710d488437fb5cfc470db1ddf1e55632bf0 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Sun, 28 Sep 2025 22:00:21 +0200 Subject: [PATCH 1/4] Explain the use of float16 for the dtype option --- benchmarks/70b_fp8_mi300x_docker.sh | 2 ++ benchmarks/70b_fp8_mi300x_slurm.sh | 2 ++ benchmarks/70b_fp8_mi325x_docker.sh | 2 ++ benchmarks/70b_fp8_mi325x_slurm.sh | 2 ++ 4 files changed, 8 insertions(+) diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh index 568b42777..9d616bf24 100644 --- a/benchmarks/70b_fp8_mi300x_docker.sh +++ b/benchmarks/70b_fp8_mi300x_docker.sh @@ -34,6 +34,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh index 141a1ea6c..9581c88fa 100644 --- a/benchmarks/70b_fp8_mi300x_slurm.sh +++ b/benchmarks/70b_fp8_mi300x_slurm.sh @@ -45,6 +45,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh index a0de76137..1bcc7b183 100644 --- a/benchmarks/70b_fp8_mi325x_docker.sh +++ b/benchmarks/70b_fp8_mi325x_docker.sh @@ -22,6 +22,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh index 6d3461318..1b19f9edb 100644 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -34,6 +34,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ From 95a499e317ec858dce68d198b8a3b2c492311d85 Mon Sep 17 00:00:00 2001 From: Quentin Colombet Date: Sun, 28 Sep 2025 22:13:07 +0200 Subject: [PATCH 2/4] [mi355][70b] Explicitly set VLLM_ROCM_USE_AITER_MHA The default for the environment variable `VLLM_ROCM_USE_AITER_MHA` changed in the 09/27 RC1 Docker image. Set the variable explicitly in both branches so that benchmark performance does not depend on the image's default. --- benchmarks/70b_fp4_mi355x_docker.sh | 2 ++ benchmarks/70b_fp4_mi355x_slurm.sh | 2 ++ 2 files changed, 4 insertions(+) diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh index c2927bbcf..258c0bc76 100644 --- a/benchmarks/70b_fp4_mi355x_docker.sh +++ b/benchmarks/70b_fp4_mi355x_docker.sh @@ -28,6 +28,8 @@ elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ "$CONC" -gt "16" ]]; then export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 fi fi diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh index 1a8b86017..6f991abe0 100644 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ b/benchmarks/70b_fp4_mi355x_slurm.sh @@ -35,6 +35,8 @@ elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ "$CONC" -gt "16" ]]; then export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 fi fi From eca8d224d6165835e0212e5bb852732bdb3dee18 Mon Sep 17 00:00:00 2001 From: Jeremy Arnold 
<103538711+JArnoldAMD@users.noreply.github.com> Date: Sun, 28 Sep 2025 16:33:18 -0500 Subject: [PATCH 3/4] Update 70b_fp4_mi355x_docker.sh envvars --- benchmarks/70b_fp4_mi355x_docker.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh index 258c0bc76..3336b6225 100644 --- a/benchmarks/70b_fp4_mi355x_docker.sh +++ b/benchmarks/70b_fp4_mi355x_docker.sh @@ -31,6 +31,11 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then else export VLLM_ROCM_USE_AITER_MHA=0 fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi fi set -x From 39dafc4497a80ecd4914b27eef70304821f2123e Mon Sep 17 00:00:00 2001 From: Jeremy Arnold <103538711+JArnoldAMD@users.noreply.github.com> Date: Sun, 28 Sep 2025 16:33:52 -0500 Subject: [PATCH 4/4] Update 70b_fp4_mi355x_slurm.sh envvars --- benchmarks/70b_fp4_mi355x_slurm.sh | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh index 6f991abe0..83b0ff367 100644 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ b/benchmarks/70b_fp4_mi355x_slurm.sh @@ -33,11 +33,16 @@ elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then export VLLM_TRITON_FP4_GEMM_USE_ASM=1 fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -gt "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - else - export VLLM_ROCM_USE_AITER_MHA=0 - fi + if [[ "$CONC" -gt "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi fi