# Patch summary (preamble before the first "diff --git" header).
#
# Touches six launch scripts under benchmarks/:
#
# 1. 70b_fp4_mi355x_docker.sh and 70b_fp4_mi355x_slurm.sh, in the branch for
#    ISL == 8192 && OSL == 1024:
#      - the existing `CONC > 16` test that exports VLLM_ROCM_USE_AITER_MHA=1
#        gains an `else` that exports VLLM_ROCM_USE_AITER_MHA=0;
#      - a second test is added: when CONC < 16 and TP > 1 it exports
#        VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0, otherwise it exports
#        VLLM_TRITON_FP4_GEMM_USE_ASM=1.
#    With these conditions CONC == 16 takes the MHA=0 branch (not > 16) and
#    the VLLM_TRITON_FP4_GEMM_USE_ASM=1 branch (not < 16).
#    In the slurm script the original three-line `CONC > 16` block is removed
#    and re-added with the same visible tokens.
#    NOTE(review): this looks like a re-indentation only -- confirm against
#    the real files, whitespace is not recoverable from this text.
#
# 2. 70b_fp8_mi300x_{docker,slurm}.sh and 70b_fp8_mi325x_{docker,slurm}.sh:
#    only a two-line shell comment is added above `set -x` / `vllm serve`,
#    stating that float16 performs better than the bfloat16 chosen by
#    `--dtype auto`.
#    NOTE(review): the --dtype argument itself lies outside the hunk context
#    shown here -- presumably it is set to float16 further down; verify.
#
# NOTE(review): the hunk headers declare multi-line hunks (e.g. "-28,6 +28,13")
# but the patch body below sits on two physical lines, so line breaks and
# leading whitespace have been lost (presumably an extraction artifact).
# It is unlikely to apply as-is; regenerate it from the original commit
# rather than re-flowing it by hand.
diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh index c2927bbcf..3336b6225 100644 --- a/benchmarks/70b_fp4_mi355x_docker.sh +++ b/benchmarks/70b_fp4_mi355x_docker.sh @@ -28,6 +28,13 @@ elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then if [[ "$CONC" -gt "16" ]]; then export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 fi fi diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh index 1a8b86017..83b0ff367 100644 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ b/benchmarks/70b_fp4_mi355x_slurm.sh @@ -33,9 +33,16 @@ elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then export VLLM_TRITON_FP4_GEMM_USE_ASM=1 fi elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then - if [[ "$CONC" -gt "16" ]]; then - export VLLM_ROCM_USE_AITER_MHA=1 - fi + if [[ "$CONC" -gt "16" ]]; then + export VLLM_ROCM_USE_AITER_MHA=1 + else + export VLLM_ROCM_USE_AITER_MHA=0 + fi + if [[ "$CONC" -lt "16" && "$TP" -gt "1" ]]; then + export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 + else + export VLLM_TRITON_FP4_GEMM_USE_ASM=1 + fi fi diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh index 568b42777..9d616bf24 100644 --- a/benchmarks/70b_fp8_mi300x_docker.sh +++ b/benchmarks/70b_fp8_mi300x_docker.sh @@ -34,6 +34,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). 
set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh index 141a1ea6c..9581c88fa 100644 --- a/benchmarks/70b_fp8_mi300x_slurm.sh +++ b/benchmarks/70b_fp8_mi300x_slurm.sh @@ -45,6 +45,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh index a0de76137..1bcc7b183 100644 --- a/benchmarks/70b_fp8_mi325x_docker.sh +++ b/benchmarks/70b_fp8_mi325x_docker.sh @@ -22,6 +22,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh index 6d3461318..1b19f9edb 100644 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -34,6 +34,8 @@ elif [[ "$ISL" == "8192" && "$OSL" == "1024" ]]; then fi fi +# In this specific case, float16 performs better than the datatype +# picked by vllm when using auto for --dtype (bfloat16). set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \