SemiAnalysisAI · cquil11 · Oct 13, 2025 · Oct 9, 2025
diff --git a/benchmarks/70b_fp8_mi300x_docker.sh b/benchmarks/70b_fp8_mi300x_docker.sh
@@ -17,6 +17,11 @@
 # Disable that feature to avoid crashes.
 # This is related to the changes in the driver at:
 # https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
+
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 version=`rocm-smi --showfw | grep MEC | head -n 1 |  awk '{print $NF}'`
 if [[ "$version" == "" || $version -lt 177 ]]; then
   export HSA_NO_SCRATCH_RECLAIM=1
@@ -49,5 +54,6 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests
diff --git a/benchmarks/70b_fp8_mi300x_slurm.sh b/benchmarks/70b_fp8_mi300x_slurm.sh
@@ -28,6 +28,11 @@ PORT=8888
 # Disable that feature to avoid crashes.
 # This is related to the changes in the driver at:
 # https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates
+
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 version=`rocm-smi --showfw | grep MEC | head -n 1 |  awk '{print $NF}'`
 if [[ "$version" == "" || $version -lt 177 ]]; then
   export HSA_NO_SCRATCH_RECLAIM=1
@@ -60,6 +65,7 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests \
 > $SERVER_LOG 2>&1 &

diff --git a/benchmarks/70b_fp8_mi325x_docker.sh b/benchmarks/70b_fp8_mi325x_docker.sh
@@ -12,6 +12,11 @@
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
+
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
     export VLLM_ROCM_USE_AITER_MHA=0
 elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
@@ -43,5 +48,6 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests
diff --git a/benchmarks/70b_fp8_mi325x_slurm.sh b/benchmarks/70b_fp8_mi325x_slurm.sh
@@ -24,6 +24,10 @@ PORT=$(( 8888 + $PORT_OFFSET ))
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
     export VLLM_ROCM_USE_AITER_MHA=0
 elif [[ "$ISL" == "1024" && "$OSL" == "8192" ]]; then
@@ -55,6 +59,7 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests \
 > $SERVER_LOG 2>&1 &

diff --git a/benchmarks/70b_fp8_mi355x_docker.sh b/benchmarks/70b_fp8_mi355x_docker.sh
@@ -12,6 +12,13 @@
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
+sleep 5
+cat config.yaml
+
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
@@ -37,6 +44,7 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests
 
diff --git a/benchmarks/70b_fp8_mi355x_slurm.sh b/benchmarks/70b_fp8_mi355x_slurm.sh
@@ -18,6 +18,10 @@ SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log)
 # Reference
 # https://rocm.docs.amd.com/en/docs-7.0-rc1/preview/benchmark-docker/inference-vllm-llama-3.3-70b-fp8.html#run-the-inference-benchmark
 
+cat > config.yaml << EOF
+compilation-config: '{"custom_ops": ["-rms_norm", "-quant_fp8", "-silu_and_mul"]}'
+EOF
+
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 
 if [[ "$ISL" == "1024" && "$OSL" == "1024" ]]; then
@@ -43,6 +47,7 @@ vllm serve $MODEL --port=$PORT \
 --max-num-seqs=$CONC \
 --max-num-batched-tokens=131072 \
 --no-enable-prefix-caching \
+--config config.yaml \
 --async-scheduling \
 --disable-log-requests \
 > $SERVER_LOG 2>&1 &