From 072922d98b78a69c56ae47bde1b4d2f1072ad4e3 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 20 Jan 2026 23:03:48 +0000 Subject: [PATCH 1/6] feat: update AMD vLLM configs to v0.14.0 upstream - Update gptoss-fp4-mi300x-vllm, gptoss-fp4-mi325x-vllm, and gptoss-fp4-mi355x-vllm to use vllm/vllm-openai-rocm:v0.14.0 - Remove --async-scheduling flag (now default in v0.14.0) vLLM 0.14.0 key improvements for ROCm: - Async scheduling enabled by default (throughput improvement) - AITER RMSNorm fusion - MTP for AITER MLA - PyTorch 2.9.1 required Closes #492 Co-authored-by: Bryan Shan --- .github/configs/amd-master.yaml | 6 +++--- benchmarks/gptoss_fp4_mi300x_docker.sh | 3 +-- benchmarks/gptoss_fp4_mi325x_docker.sh | 3 +-- benchmarks/gptoss_fp4_mi355x_docker.sh | 3 +-- 4 files changed, 6 insertions(+), 9 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 4596806b6..5f8a2f966 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -113,7 +113,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } gptoss-fp4-mi300x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + image: vllm/vllm-openai-rocm:v0.14.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi300x @@ -144,7 +144,7 @@ gptoss-fp4-mi300x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi325x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + image: vllm/vllm-openai-rocm:v0.14.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi325x @@ -175,7 +175,7 @@ gptoss-fp4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: - image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 + image: vllm/vllm-openai-rocm:v0.14.0 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi355x diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index e66004f9d..9bf66b43f 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -40,8 +40,7 @@ vllm serve $MODEL --port $PORT \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 8b7dd5c87..c46121ed3 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -39,8 +39,7 @@ vllm serve $MODEL --port $PORT \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index bff0be872..dedd2f0d2 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -36,8 +36,7 @@ vllm serve $MODEL --port $PORT \ --config config.yaml \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! From 4633384784d0dda90e96cd8742873e7dc95359aa Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 20 Jan 2026 23:10:05 +0000 Subject: [PATCH 2/6] fix: remove deprecated vLLM 0.14 CLI arguments from benchmark scripts Remove --max-seq-len-to-capture and --async-scheduling arguments that were deprecated/removed in vLLM 0.14: - --max-seq-len-to-capture: removed in favor of automatic detection - --async-scheduling: now the default behavior, flag removed Co-Authored-By: Claude Opus 4.5 --- benchmarks/gptoss_fp4_mi300x_docker.sh | 1 - benchmarks/gptoss_fp4_mi300x_slurm.sh | 2 -- benchmarks/gptoss_fp4_mi325x_docker.sh | 1 - benchmarks/gptoss_fp4_mi325x_slurm.sh | 2 -- benchmarks/gptoss_fp4_mi355x_docker.sh | 1 - benchmarks/gptoss_fp4_mi355x_slurm.sh | 4 +--- 6 files changed, 1 insertion(+), 10 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 9bf66b43f..84c272fbe 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -36,7 +36,6 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index ca7b7fc2f..3dbfe6c9b 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -40,12 +40,10 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling \ > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index c46121ed3..877136aa7 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -35,7 +35,6 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 4b8532aa6..5ce159962 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -39,12 +39,10 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --compilation-config '{"cudagraph_mode": "FULL_AND_PIECEWISE"}' \ --block-size=64 \ --no-enable-prefix-caching \ --disable-log-requests \ ---async-scheduling \ > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index dedd2f0d2..1b6019f20 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -32,7 +32,6 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --config config.yaml \ --block-size=64 \ --no-enable-prefix-caching \ diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 18aa13fff..bc1f7985e 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -33,12 +33,10 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ ---max-seq-len-to-capture $MAX_MODEL_LEN \ --config config.yaml \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests \ ---async-scheduling > $SERVER_LOG 2>&1 & +--disable-log-requests > $SERVER_LOG 2>&1 & SERVER_PID=$! From c752cd899ff82ad9f3fe70afc5c5d9fa7070a887 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Tue, 20 Jan 2026 23:16:04 +0000 Subject: [PATCH 3/6] fix: set HIP_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ The vLLM 0.14.0 ROCm image includes a newer Ray version that requires HIP_VISIBLE_DEVICES to be set instead of ROCR_VISIBLE_DEVICES for GPU visibility. Add conditional export to all AMD benchmark scripts to copy ROCR_VISIBLE_DEVICES to HIP_VISIBLE_DEVICES when present. Co-Authored-By: Claude Opus 4.5 --- benchmarks/gptoss_fp4_mi300x_docker.sh | 7 ++++++- benchmarks/gptoss_fp4_mi300x_slurm.sh | 5 +++++ benchmarks/gptoss_fp4_mi325x_docker.sh | 5 +++++ benchmarks/gptoss_fp4_mi325x_slurm.sh | 5 +++++ benchmarks/gptoss_fp4_mi355x_docker.sh | 5 +++++ benchmarks/gptoss_fp4_mi355x_slurm.sh | 5 +++++ 6 files changed, 31 insertions(+), 1 deletion(-) diff --git a/benchmarks/gptoss_fp4_mi300x_docker.sh b/benchmarks/gptoss_fp4_mi300x_docker.sh index 84c272fbe..31c768b8b 100644 --- a/benchmarks/gptoss_fp4_mi300x_docker.sh +++ b/benchmarks/gptoss_fp4_mi300x_docker.sh @@ -24,9 +24,14 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 -export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 +export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 export VLLM_ROCM_QUICK_REDUCE_QUANTIZATION=INT4 SERVER_LOG=$(mktemp /tmp/server-XXXXXX.log) diff --git a/benchmarks/gptoss_fp4_mi300x_slurm.sh b/benchmarks/gptoss_fp4_mi300x_slurm.sh index 3dbfe6c9b..66f3f47c8 100644 --- a/benchmarks/gptoss_fp4_mi300x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi300x_slurm.sh @@ -30,6 +30,11 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 diff --git a/benchmarks/gptoss_fp4_mi325x_docker.sh b/benchmarks/gptoss_fp4_mi325x_docker.sh index 877136aa7..ad6f9fdb7 100644 --- a/benchmarks/gptoss_fp4_mi325x_docker.sh +++ b/benchmarks/gptoss_fp4_mi325x_docker.sh @@ -24,6 +24,11 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 diff --git a/benchmarks/gptoss_fp4_mi325x_slurm.sh b/benchmarks/gptoss_fp4_mi325x_slurm.sh index 5ce159962..cbfd2e500 100644 --- a/benchmarks/gptoss_fp4_mi325x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi325x_slurm.sh @@ -30,6 +30,11 @@ if [[ "$version" == "" || $version -lt 177 ]]; then export HSA_NO_SCRATCH_RECLAIM=1 fi +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_TRITON_BF16_GEMM=0 diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 1b6019f20..91a376d30 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -21,6 +21,11 @@ EOF sleep 5 cat config.yaml +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index bc1f7985e..c5fc4393e 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -24,6 +24,11 @@ EOF sleep 5 cat config.yaml +# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ +if [ -n "$ROCR_VISIBLE_DEVICES" ]; then + export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" +fi + export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 From 45b16c9bb62714da6b2549ac1a3c233d66d2c299 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Wed, 21 Jan 2026 23:16:49 +0000 Subject: [PATCH 4/6] fix: scope PR to MI300X and MI325X only, add perf-changelog - Revert MI355X changes (infra not ready yet) - Add perf-changelog.yaml entry documenting vLLM v0.14.0 upgrade for MI300X/MI325X Co-authored-by: functionstackx --- .github/configs/amd-master.yaml | 2 +- benchmarks/gptoss_fp4_mi355x_docker.sh | 16 ++++++++++------ benchmarks/gptoss_fp4_mi355x_slurm.sh | 18 ++++++++++++------ perf-changelog.yaml | 10 ++++++++++ 4 files changed, 33 insertions(+), 13 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 5f8a2f966..1eeed7bf1 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -175,7 +175,7 @@ gptoss-fp4-mi325x-vllm: - { tp: 8, conc-start: 4, conc-end: 16 } gptoss-fp4-mi355x-vllm: - image: vllm/vllm-openai-rocm:v0.14.0 + image: rocm/7.0:rocm7.0_ubuntu_22.04_vllm_0.10.1_instinct_20250927_rc1 model: openai/gpt-oss-120b model-prefix: gptoss runner: mi355x diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 91a376d30..7c708ae62 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -21,11 +21,6 @@ EOF sleep 5 cat config.yaml -# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ -if [ -n "$ROCR_VISIBLE_DEVICES" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 @@ -37,10 +32,12 @@ vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ +--max-seq-len-to-capture $MAX_MODEL_LEN \ --config config.yaml \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests \ +--async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -58,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index c5fc4393e..1e5d87dba 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -24,24 +24,23 @@ EOF sleep 5 cat config.yaml -# Set HIP_VISIBLE_DEVICES to match ROCR_VISIBLE_DEVICES for Ray compatibility in vLLM 0.14+ -if [ -n "$ROCR_VISIBLE_DEVICES" ]; then - export HIP_VISIBLE_DEVICES="$ROCR_VISIBLE_DEVICES" -fi - export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ --gpu-memory-utilization 0.95 \ --max-model-len $MAX_MODEL_LEN \ +--max-seq-len-to-capture $MAX_MODEL_LEN \ --config config.yaml \ --block-size=64 \ --no-enable-prefix-caching \ ---disable-log-requests > $SERVER_LOG 2>&1 & +--disable-log-requests \ +--async-scheduling > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -59,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 5227dd9d2..71a44dabb 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -185,3 +185,13 @@ description: - Add internal AMD ATOM inference engine for DeepSeek R1 FP8, FP4 and GPTOSS FP4 Mi355X pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/419 + +- config-keys: + - gptoss-fp4-mi300x-vllm + - gptoss-fp4-mi325x-vllm + description: + - "Update AMD MI300X and MI325X GPT-OSS 120B vLLM to use upstream ROCm image vllm/vllm-openai-rocm:v0.14.0" + - "Remove deprecated --async-scheduling flag (now enabled by default in vLLM v0.14.0)" + - "Remove deprecated --max-seq-len-to-capture flag" + - "Add HIP_VISIBLE_DEVICES env var for Ray compatibility in vLLM 0.14+" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/496 From cb8be6c3faea0a0416ada50fe4990bbdc4c312b3 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Wed, 21 Jan 2026 23:22:18 +0000 Subject: [PATCH 5/6] fix: remove MI355X changes from PR, scope to MI300X and MI325X only Revert changes to MI355X benchmark scripts as MI355X infrastructure is not ready yet. This PR should only contain changes for MI300X and MI325X vLLM v0.14.0 upgrade. Co-authored-by: functionstackx --- benchmarks/gptoss_fp4_mi355x_docker.sh | 7 ------- benchmarks/gptoss_fp4_mi355x_slurm.sh | 9 --------- 2 files changed, 16 deletions(-) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index 7c708ae62..bff0be872 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -55,10 +55,3 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC - append_lm_eval_summary -fi -set +x diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 1e5d87dba..18aa13fff 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -28,8 +28,6 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 -# -## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -58,10 +56,3 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ - -# After throughput, run evaluation only if RUN_EVAL is true -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC - append_lm_eval_summary -fi -set +x From a7dd3a8a7d9b31a60c0aa15a249193f5523c9218 Mon Sep 17 00:00:00 2001 From: "claude[bot]" <41898282+claude[bot]@users.noreply.github.com> Date: Wed, 21 Jan 2026 23:23:05 +0000 Subject: [PATCH 6/6] fix: sync MI355X files with origin/main to remove PR diff Update MI355X benchmark scripts to match current origin/main state, which includes eval blocks from PR #258. This ensures zero diff for MI355X files in this PR. Co-authored-by: functionstackx --- benchmarks/gptoss_fp4_mi355x_docker.sh | 7 +++++++ benchmarks/gptoss_fp4_mi355x_slurm.sh | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/benchmarks/gptoss_fp4_mi355x_docker.sh b/benchmarks/gptoss_fp4_mi355x_docker.sh index bff0be872..7c708ae62 100644 --- a/benchmarks/gptoss_fp4_mi355x_docker.sh +++ b/benchmarks/gptoss_fp4_mi355x_docker.sh @@ -55,3 +55,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x diff --git a/benchmarks/gptoss_fp4_mi355x_slurm.sh b/benchmarks/gptoss_fp4_mi355x_slurm.sh index 18aa13fff..1e5d87dba 100644 --- a/benchmarks/gptoss_fp4_mi355x_slurm.sh +++ b/benchmarks/gptoss_fp4_mi355x_slurm.sh @@ -28,6 +28,8 @@ export VLLM_USE_AITER_UNIFIED_ATTENTION=1 export VLLM_ROCM_USE_AITER_MHA=0 export VLLM_ROCM_USE_AITER_FUSED_MOE_A16W4=1 +# +## Start up vllm server set -x vllm serve $MODEL --port $PORT \ --tensor-parallel-size=$TP \ @@ -56,3 +58,10 @@ run_benchmark_serving \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir /workspace/ + +# After throughput, run evaluation only if RUN_EVAL is true +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" --concurrent-requests $CONC + append_lm_eval_summary +fi +set +x