diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh
index a711cb0c8..96f26c5c0 100644
--- a/benchmarks/70b_fp8_mi300x_docker.sh
+++ b/benchmarks/70b_fp8_mi300x_docker.sh
@@ -17,6 +17,11 @@
 # Disable that features to avoid crashes.
 # This is related to the changes in the driver at:
 # https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
+
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
 if [[ "$version" == "" || $version -lt 177 ]]; then
     export HSA_NO_SCRATCH_RECLAIM=1
@@ -49,5 +54,6 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests
diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh
index 188abb905..6576997d4 100644
--- a/benchmarks/70b_fp8_mi300x_slurm.sh
+++ b/benchmarks/70b_fp8_mi300x_slurm.sh
@@ -28,6 +28,11 @@ PORT=8888
 # Disable that features to avoid crashes.
 # This is related to the changes in the driver at:
 # https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
+
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'`
 if [[ "$version" == "" || $version -lt 177 ]]; then
     export HSA_NO_SCRATCH_RECLAIM=1
@@ -60,6 +65,7 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests \
 > $SERVER_LOG 2>&1 &
diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh
index 7cffface5..46cc90216 100644
--- a/benchmarks/70b_fp8_mi325x_docker.sh
+++ b/benchmarks/70b_fp8_mi325x_docker.sh
@@ -12,6 +12,11 @@
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
+
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
     export VLLM_ROCM_USE_AITER_MHA=0
 elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
@@ -43,5 +48,6 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests
diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh
index 205418c80..7dc29420c 100644
--- a/benchmarks/70b_fp8_mi325x_slurm.sh
+++ b/benchmarks/70b_fp8_mi325x_slurm.sh
@@ -24,6 +24,10 @@ PORT=$(( 8888 + $PORT_OFFSET ))
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
     export VLLM_ROCM_USE_AITER_MHA=0
 elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
@@ -55,6 +59,7 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests \
 > $SERVER_LOG 2>&1 &
diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh
index 8e3361fe8..1936be724 100644
--- a/benchmarks/70b_fp8_mi355x_docker.sh
+++ b/benchmarks/70b_fp8_mi355x_docker.sh
@@ -12,6 +12,10 @@
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
@@ -37,6 +41,7 @@
 vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests
diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh
index 0a610366f..e1ea15c57 100644
--- a/benchmarks/70b_fp8_mi355x_slurm.sh
+++ b/benchmarks/70b_fp8_mi355x_slurm.sh
@@ -18,6 +18,10 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
@@ -43,6 +47,7 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests \
 > $SERVER_LOG 2>&1 &