diff --git a/benchmarks/70b_fp4_mi355x_docker.sh b/benchmarks/70b_fp4_mi355x_docker.sh index e24470f7e..c2927bbcf 100644 --- a/benchmarks/70b_fp4_mi355x_docker.sh +++ b/benchmarks/70b_fp4_mi355x_docker.sh @@ -13,14 +13,14 @@ export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -lt "16" ]]; then + if [[ "$CONC" -le "16" ]]; then export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 else export VLLM_TRITON_FP4_GEMM_USE_ASM=1 fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -lt "16" ]]; then + if [[ "$CONC" -le "16" ]]; then export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 else export VLLM_TRITON_FP4_GEMM_USE_ASM=1 diff --git a/benchmarks/70b_fp4_mi355x_slurm.sh b/benchmarks/70b_fp4_mi355x_slurm.sh index 549f231d8..1a8b86017 100644 --- a/benchmarks/70b_fp4_mi355x_slurm.sh +++ b/benchmarks/70b_fp4_mi355x_slurm.sh @@ -20,14 +20,14 @@ export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -lt "16" ]]; then + if [[ "$CONC" -le "16" ]]; then export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 else export VLLM_TRITON_FP4_GEMM_USE_ASM=1 fi elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then export VLLM_ROCM_USE_AITER_MHA=0 - if [[ "$CONC" -lt "16" ]]; then + if [[ "$CONC" -le "16" ]]; then export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 else export VLLM_TRITON_FP4_GEMM_USE_ASM=1 diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh index e95696ab6..568b42777 100644 --- a/benchmarks/70b_fp8_mi300x_docker.sh +++ b/benchmarks/70b_fp8_mi300x_docker.sh @@ -38,7 +38,7 @@ set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ --gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ +--dtype=float16 --kv-cache-dtype=fp8 \ --distributed-executor-backend=mp --tensor-parallel-size=$TP \ --max-model-len=$MAX_MODEL_LEN \ --max-seq-len-to-capture=$MAX_MODEL_LEN \ diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh index 3d9d33d6d..141a1ea6c 100644 --- a/benchmarks/70b_fp8_mi300x_slurm.sh +++ b/benchmarks/70b_fp8_mi300x_slurm.sh @@ -49,7 +49,7 @@ set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ --gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ +--dtype=float16 --kv-cache-dtype=fp8 \ --distributed-executor-backend=mp --tensor-parallel-size=$TP \ --max-model-len=$MAX_MODEL_LEN \ --max-seq-len-to-capture=$MAX_MODEL_LEN \ diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh index 49bd9f01c..a0de76137 100644 --- a/benchmarks/70b_fp8_mi325x_docker.sh +++ b/benchmarks/70b_fp8_mi325x_docker.sh @@ -26,7 +26,7 @@ set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ --gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ +--dtype=float16 --kv-cache-dtype=fp8 \ --distributed-executor-backend=mp --tensor-parallel-size=$TP \ --max-model-len=$MAX_MODEL_LEN \ --max-seq-len-to-capture=$MAX_MODEL_LEN \ diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh index fb889a6b7..6d3461318 100644 --- a/benchmarks/70b_fp8_mi325x_slurm.sh +++ b/benchmarks/70b_fp8_mi325x_slurm.sh @@ -38,7 +38,7 @@ set -x vllm serve $MODEL --port=$PORT \ --swap-space=64 \ --gpu-memory-utilization=0.94 \ ---dtype=auto --kv-cache-dtype=fp8 \ +--dtype=float16 --kv-cache-dtype=fp8 \ --distributed-executor-backend=mp --tensor-parallel-size=$TP \ --max-model-len=$MAX_MODEL_LEN \ --max-seq-len-to-capture=$MAX_MODEL_LEN \