diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4596806b6..1eeed7bf1 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } gptoss-fp4-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + image: vllm/vllm-openai-rocm:v0.14.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x @@ -144,7 +144,7 @@ gptoss-fp4-mi300x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + image: vllm/vllm-openai-rocm:v0.14.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi325x diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 467a32a58..4a4fb5f89 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -24,9 +24,14 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 -export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) @@ -36,12 +41,10 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index bc385c264..ac251d99c 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -30,6 +30,11 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 @@ -42,12 +47,10 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling \ > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 054f6c377..ad22cda35 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -24,6 +24,11 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 @@ -35,12 +40,10 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index c0c9597c2..03d131285 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -30,6 +30,11 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 @@ -39,12 +44,11 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & +> $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5227dd9d2..71a44dabb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -185,3 +185,13 @@ description: - Add internal AMD ATOM inference engine for DeepSeek R1 FP8, FP4 and GPTOSS FP4 Mi355X pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/419 + +- config-keys: + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + description: + - "Update AMD MI300X and MI325X GPT-OSS 120B vLLM to use upstream ROCm image vllm/vllm-openai-rocm:v0.14.0" + - "Remove deprecated --async-scheduling flag (now enabled by default in vLLM v0.14.0)" + - "Remove deprecated --max-seq-len-to-capture flag" + - "Add HIP_VISIBLE_DEVICES env var for Ray compatibility in vLLM 0.14+" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/496