From d66409b34679f62dfe8892ecbac8cfc2b0517623 Mon Sep 17 00:00:00 2001
From: jiacao-amd
Date: Wed, 29 Apr 2026 11:28:27 -0500
Subject: [PATCH] Adjust MiniMax block size for TP8 EP8

---
Reviewer notes (this area is not part of the commit message or the diff):

* The hard-coded `--block-size=32` server argument is replaced by
  `--block-size=$VLLM_BLOCK_SIZE`. VLLM_BLOCK_SIZE is a plain shell variable
  (it is not exported in the hunks below) that is initialised to 16.
* When both $TP and $EP_SIZE compare equal to the string "8", the script
  re-exports VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0 (overriding the `=1` export
  three lines earlier), sets VLLM_BLOCK_SIZE=32, and echoes a status message.
* Net effect: TP8/EP8 keeps block size 32 but now runs with the shuffle KV
  cache layout disabled; every other configuration keeps the shuffle layout
  enabled but its block size changes from 32 to 16.
* NOTE(review): the patch does not state why block size 32 and the shuffle
  layout are mutually exclusive, nor why 16 is the new default -- confirm
  with the author that the change for non-TP8/EP8 runs is intended, since
  the subject line only mentions TP8/EP8.
* NOTE(review): where $TP and $EP_SIZE are set is outside these hunks;
  presumably earlier in the script or in the caller's environment -- verify.

 benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
index 53cffceee..f5979dedb 100755
--- a/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
+++ b/benchmarks/single_node/minimaxm2.5_fp8_mi355x.sh
@@ -27,6 +27,13 @@ fi
 export VLLM_ROCM_USE_AITER=1
 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4
 export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=1
+VLLM_BLOCK_SIZE=16
+
+if [[ "$TP" == "8" && "$EP_SIZE" == "8" ]]; then
+    export VLLM_ROCM_SHUFFLE_KV_CACHE_LAYOUT=0
+    VLLM_BLOCK_SIZE=32
+    echo "Disabling shuffle KV cache layout and using block size 32 for TP8/EP8."
+fi
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
@@ -52,7 +59,7 @@ $EP \
 --gpu-memory-utilization 0.95 \
 --max-model-len $MAX_MODEL_LEN \
 --kv-cache-dtype fp8 \
---block-size=32 \
+--block-size=$VLLM_BLOCK_SIZE \
 --no-enable-prefix-caching \
 --attention-backend "ROCM_AITER_FA" \
 --trust-remote-code > $SERVER_LOG 2>&1 &