From 13f95599af6fe15d211ea750faa2261f371f96ca Mon Sep 17 00:00:00 2001 From: seungrokjung Date: Sat, 27 Sep 2025 22:19:06 -0500 Subject: [PATCH 1/2] 0927 final Signed-off-by: seungrokjung --- benchmarks/gptoss_fp4_mi300x_docker.sh | 2 -- benchmarks/gptoss_fp4_mi300x_slurm.sh | 2 -- benchmarks/gptoss_fp4_mi325x_docker.sh | 3 --- benchmarks/gptoss_fp4_mi325x_slurm.sh | 3 --- benchmarks/gptoss_fp4_mi355x_docker.sh | 2 -- benchmarks/gptoss_fp4_mi355x_slurm.sh | 3 +-- 6 files changed, 1 insertion(+), 14 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 7dccbd805..66a8642bd 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -19,11 +19,9 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 set -x diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index d37b2654b..8b657a085 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -30,11 +30,9 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 set -x diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 147c0a84b..4cf1ebadf 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -9,12 +9,9 @@ # CONC # MAX_MODEL_LEN -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 set -x vllm serve $MODEL --port $PORT \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 7a26cde02..c1b5dbdcf 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -21,12 +21,9 @@ huggingface-cli download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 -export ROCM_TRITON_MOE_PRESHUFFLE_SCALES=0 set -x vllm serve $MODEL --port $PORT \ diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index a2fc54bad..103e77fe3 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -16,8 +16,6 @@ EOF sleep 5 cat config.yaml -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 867a26233..657bc1fdf 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -22,10 +22,9 @@ EOF sleep 5 cat config.yaml -export HSA_NO_SCRATCH_RECLAIM=1 -export NCCL_MIN_NCHANNELS=112 export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 +export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 set -x vllm serve $MODEL --port $PORT \ From 2a2208fc2b5fe548763b88f6c29256ce4a863322 Mon Sep 17 00:00:00 2001 From: seungrokjung Date: Sun, 28 Sep 2025 01:09:06 -0500 Subject: [PATCH 2/2] mi325; gfx942; compatible with old gpu fw nodes Signed-off-by: seungrokjung --- benchmarks/gptoss_fp4_mi325x_docker.sh | 10 ++++++++++ benchmarks/gptoss_fp4_mi325x_slurm.sh | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 4cf1ebadf..05250267f 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -9,6 +9,16 @@ # CONC # MAX_MODEL_LEN +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that features to avoid crashes. +# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index c1b5dbdcf..d89ed501c 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -21,6 +21,16 @@ huggingface-cli download $MODEL SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) PORT=8888 +# If the machine runs a MEC FW older than 177, RCCL +# cannot reclaim some memory. +# Disable that features to avoid crashes. +# This is related to the changes in the driver at: +# https://rocm.docs.amd.com/en/docs-6.4.3/about/release-notes.html#amdgpu-driver-updates +version=`rocm-smi --showfw | grep MEC | head -n 1 | awk '{print $NF}'` +if [[ "$version" == "" || $version -lt 177 ]]; then + export HSA_NO_SCRATCH_RECLAIM=1 +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0