From 4905fb53f00b583d7ccd4b2781d500539f4b1bb2 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 03:58:09 -0500 Subject: [PATCH 01/19] feat: enhance Qwen benchmark scripts with additional parameters * Added CONTEXT_LENGTH and MAX_PREFILL_TOKENS variables for better configuration. * Updated launch_server command with new options: --tokenizer-worker-num, --enable-aiter-allreduce-fusion, --cuda-graph-max-bs, --context-length, --disable-radix-cache, --max-prefill-tokens, and --scheduler-recv-interval. --- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 9 +++++++++ benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 9 +++++++++ 2 files changed, 18 insertions(+) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index 701695def..b67ab63cf 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -19,6 +19,8 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -35,6 +37,13 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index 701695def..b67ab63cf 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -19,6 +19,8 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} +CONTEXT_LENGTH=$((ISL + OSL + 20)) +MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -35,6 +37,13 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --trust-remote-code \ + --tokenizer-worker-num 6 \ + --enable-aiter-allreduce-fusion \ + --cuda-graph-max-bs $CONC \ + --context-length $CONTEXT_LENGTH \ + --disable-radix-cache \ + --max-prefill-tokens $MAX_PREFILL_TOKENS \ + --scheduler-recv-interval 30 \ --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! From 7bcd9397329978d55b836a44927d47754e83e925 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 03:59:46 -0500 Subject: [PATCH 02/19] Update perf-changelog.yaml to include new Qwen3.5 FP8 and BF16 SGLang benchmark configurations for MI355X, enhancing performance with updated CLI arguments. 
--- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a3d7b5e3e..c5b7ad661 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1278,3 +1278,10 @@ - "New framework: dynamo-vllm (Dynamo frontend + vLLM backend)" - "Runner script updated to clone NVIDIA/srt-slurm and map vLLM container image" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1008 + +- config-keys: + - qwen3.5-fp8-mi355x-sglang + - qwen3.5-bf16-mi355x-sglang + description: + - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/942 From 45d74622b0e72f3c09a2f727457a852036dc0d61 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 03:59:46 -0500 Subject: [PATCH 03/19] Update SGLang image versions for Qwen3.5 configurations in amd-master.yaml to v0.5.9, ensuring compatibility with recent changes. --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 265dc48ca..66f748cb8 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260215 + image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260325 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -186,7 +186,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: rocm/sgl-dev:v0.5.8.post1-rocm720-mi35x-20260218 + image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260325 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x From 5d4bdf26b784828ea3960364f92719b662fbb161 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 03:59:46 -0500 
Subject: [PATCH 04/19] Use the 20260327 build of the SGLang ROCm image for Qwen3.5 MI355X configs --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 66f748cb8..7f065d4e2 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260325 + image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260327 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -186,7 +186,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260325 + image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260327 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x From ded63118bfabac429ba6a248b715bc37b07ff152 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 03:59:46 -0500 Subject: [PATCH 05/19] Update perf-changelog.yaml to reflect the new PR link for Qwen3.5 FP8 and BF16 SGLang benchmarks on MI355X, ensuring accurate tracking of performance enhancements. 
--- perf-changelog.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c5b7ad661..2a9914ef1 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1284,4 +1284,5 @@ - qwen3.5-bf16-mi355x-sglang description: - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/942 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/980 + From 87a8701d843bea2476be3468e284c66698f404a1 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:45 -0500 Subject: [PATCH 06/19] Update Qwen3.5 image tags in amd-master.yaml to v0.5.10rc0 for MI355X configurations and adjust perf-changelog.yaml to reflect the changes, ensuring accurate performance tracking and compatibility. --- .github/configs/amd-master.yaml | 4 ++-- perf-changelog.yaml | 4 +++- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7f065d4e2..2345cac87 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260327 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -186,7 +186,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.9-rocm720-mi35x-20260327 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2a9914ef1..aa76394b3 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1282,7 +1282,9 @@ - config-keys: - qwen3.5-fp8-mi355x-sglang 
- qwen3.5-bf16-mi355x-sglang + - qwen3.5-fp8-mi355x-sglang description: - - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" + - "Fix MI355X Qwen 3.5 image tag: v0.5.9-rocm720-mi35x-20260327 is not on Docker Hub (404)" + - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260329" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/980 From 647a8287a7c50195ccb8ce2e6dbc4cc153adb0c4 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:45 -0500 Subject: [PATCH 07/19] Update Qwen3.5 FP8 and BF16 SGLang benchmark descriptions in perf-changelog.yaml to reflect improved CLI arguments for MI355X, ensuring better performance tracking. --- perf-changelog.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index aa76394b3..3d4fe494b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1284,7 +1284,7 @@ - qwen3.5-bf16-mi355x-sglang - qwen3.5-fp8-mi355x-sglang description: - - "Fix MI355X Qwen 3.5 image tag: v0.5.9-rocm720-mi35x-20260327 is not on Docker Hub (404)" - - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260329" + - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" + - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/980 From 56121f4d1641bbfdca9bec6e2dc05d7027336257 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:45 -0500 Subject: [PATCH 08/19] Enhance Qwen3.5 benchmark scripts for MI355X by adding EP_SIZE parameter and adjusting memory fraction. Updated launch_server command to include data-parallel-size and improved context length handling for better performance. 
--- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 8 ++++++-- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 8 ++++++-- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index b67ab63cf..f7a01a6d1 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -9,7 +9,8 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME + RESULT_FILENAME \ + EP_SIZE if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -26,6 +27,7 @@ EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -36,6 +38,8 @@ python3 -m sglang.launch_server \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --ep-size $EP_SIZE \ + --data-parallel-size 1 \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ @@ -44,7 +48,7 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index b67ab63cf..f7a01a6d1 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -9,7 +9,8 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME + RESULT_FILENAME \ + EP_SIZE if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -26,6 +27,7 @@ EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then setup_eval_context EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +else EVAL_CONTEXT_ARGS="--context-length $CONTEXT_LENGTH" fi # Start GPU monitoring (power, temperature, clocks every second) start_gpu_monitor @@ -36,6 +38,8 @@ python3 -m sglang.launch_server \ --host=0.0.0.0 \ --port $PORT \ --tensor-parallel-size $TP \ + --ep-size $EP_SIZE \ + --data-parallel-size 1 \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ @@ -44,7 +48,7 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! From 29aa4e11b7957717847325fd099b8f6f17f60b96 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 09/19] Update search-space configurations in amd-master.yaml for the Qwen3.5 BF16 and FP8 MI355X benchmarks, increasing conc-end values and adding expert-parallel (ep) entries for improved performance tuning. 
--- .github/configs/amd-master.yaml | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 2345cac87..8192f2732 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -125,11 +125,13 @@ qwen3.5-bf16-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } qwen3.5-bf16-mi300x-sglang: image: lmsysorg/sglang:v0.5.9-rocm720-mi30x @@ -198,10 +200,12 @@ qwen3.5-fp8-mi355x-sglang: osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } qwen3.5-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.9-rocm720-mi30x From 4b49ca907887e5da932b2d0b8398251b1d62c5c0 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 10/19] Remove context length parameter from Qwen3.5 BF16 and FP8 benchmark scripts for MI355X to streamline configuration and improve performance. 
--- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 1 - benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 1 - 2 files changed, 2 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index f7a01a6d1..ce82b9a53 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -44,7 +44,6 @@ python3 -m sglang.launch_server \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ - --context-length $CONTEXT_LENGTH \ --disable-radix-cache \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ --scheduler-recv-interval 30 \ diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index f7a01a6d1..ce82b9a53 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -44,7 +44,6 @@ python3 -m sglang.launch_server \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ --cuda-graph-max-bs $CONC \ - --context-length $CONTEXT_LENGTH \ --disable-radix-cache \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ --scheduler-recv-interval 30 \ From 7f35da62db988e2f00328f1014f156ad152b774c Mon Sep 17 00:00:00 2001 From: "Chen, Zhentao" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 11/19] Update Qwen3.5 MI355X configs to the lmsysorg/sglang:v0.5.10rc0-rocm720-mi35x image --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8192f2732..8b4979ace 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327 + image: lmsysorg/sglang:v0.5.10rc0-rocm720-mi35x model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -188,7 +188,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, 
conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327 + image: lmsysorg/sglang:v0.5.10rc0-rocm720-mi35x model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x From 838a89b34d02a090235bfc744796effc64389597 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 12/19] Update Qwen3.5 benchmark configurations in amd-master.yaml to include EP_SIZE parameter for search-space entries, enhancing performance tuning for MI355X and MI300X. Adjusted perf-changelog.yaml to reflect updated image tag for better performance tracking. --- .github/configs/amd-master.yaml | 8 ++++---- perf-changelog.yaml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8b4979ace..7e6ccf864 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -125,12 +125,12 @@ qwen3.5-bf16-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 256 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - { tp: 8, ep: 8, conc-start: 64, conc-end: 256 } qwen3.5-bf16-mi300x-sglang: @@ -199,12 +199,12 @@ qwen3.5-fp8-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } qwen3.5-fp8-mi300x-sglang: diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 3d4fe494b..c5571cd7e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1285,6 +1285,6 @@ 
- qwen3.5-fp8-mi355x-sglang description: - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" - - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260327" + - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/980 From 5c8597dc7a81d536834b2f275c99c1632271e082 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 13/19] Update context length calculations in Qwen3.5 benchmark scripts for BF16 and FP8 to improve performance tuning. Adjusted search-space configurations in amd-master.yaml to increase conc-end values for MI355X and MI300X. --- .github/configs/amd-master.yaml | 4 ++-- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 2 +- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 7e6ccf864..e9baab46e 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -199,12 +199,12 @@ qwen3.5-fp8-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 64 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } qwen3.5-fp8-mi300x-sglang: diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index ce82b9a53..c6bed1491 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -20,7 +20,7 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) +CONTEXT_LENGTH=$((ISL + OSL + 200)) MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" diff 
--git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index ce82b9a53..c6bed1491 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -20,7 +20,7 @@ hf download "$MODEL" SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -CONTEXT_LENGTH=$((ISL + OSL + 20)) +CONTEXT_LENGTH=$((ISL + OSL + 200)) MAX_PREFILL_TOKENS=32768 EVAL_CONTEXT_ARGS="" From 04fc10f369f246defa199e5d5dc55a256ce09f47 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 14/19] Update image tags in amd-master.yaml for Qwen3.5 benchmarks to v0.5.10rc0-rocm700 for MI355X and MI300X configurations, ensuring compatibility and improved performance tracking. --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index e9baab46e..071b0ef1a 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -188,7 +188,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang:v0.5.10rc0-rocm720-mi35x + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi35x-20260331 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x @@ -208,7 +208,7 @@ qwen3.5-fp8-mi355x-sglang: - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } qwen3.5-fp8-mi300x-sglang: - image: lmsysorg/sglang:v0.5.9-rocm720-mi30x + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi35x-20260331 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi300x From 8cb87767d4db97c6a6c0d5342061fa8015bb547f Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 15/19] Update image tags in amd-master.yaml for Qwen3.5 benchmarks, changing MI355X to v0.5.10rc0-rocm700 and MI300X to v0.5.9-rocm720, ensuring compatibility and consistency across 
configurations. --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 071b0ef1a..8eb83d644 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang:v0.5.10rc0-rocm720-mi35x + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi35x-20260331 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -208,7 +208,7 @@ qwen3.5-fp8-mi355x-sglang: - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } qwen3.5-fp8-mi300x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi35x-20260331 + image: lmsysorg/sglang:v0.5.9-rocm720-mi30x model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi300x From 4cc6b60ba3755214e51906ce087fb8b6d3737555 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 16/19] Remove data-parallel-size parameter and increase mem-fraction-static from 0.75 to 0.8 in Qwen3.5 BF16 and FP8 benchmark scripts to enhance performance tuning. 
--- benchmarks/single_node/qwen3.5_bf16_mi355x.sh | 3 +-- benchmarks/single_node/qwen3.5_fp8_mi355x.sh | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh index c6bed1491..e07d5600e 100755 --- a/benchmarks/single_node/qwen3.5_bf16_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_bf16_mi355x.sh @@ -39,7 +39,6 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --ep-size $EP_SIZE \ - --data-parallel-size 1 \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ @@ -47,7 +46,7 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh index c6bed1491..e07d5600e 100644 --- a/benchmarks/single_node/qwen3.5_fp8_mi355x.sh +++ b/benchmarks/single_node/qwen3.5_fp8_mi355x.sh @@ -39,7 +39,6 @@ python3 -m sglang.launch_server \ --port $PORT \ --tensor-parallel-size $TP \ --ep-size $EP_SIZE \ - --data-parallel-size 1 \ --trust-remote-code \ --tokenizer-worker-num 6 \ --enable-aiter-allreduce-fusion \ @@ -47,7 +46,7 @@ python3 -m sglang.launch_server \ --disable-radix-cache \ --max-prefill-tokens $MAX_PREFILL_TOKENS \ --scheduler-recv-interval 30 \ - --mem-fraction-static 0.75 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --mem-fraction-static 0.8 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From 07f617336ed43c7c9e474a6f7f738fe185c09b02 Mon Sep 17 00:00:00 2001 From: Chun Fang Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 17/19] Update sglang image for qwen3.5 mi355x configs to fix shared memory crash --- .github/configs/amd-master.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 8eb83d644..22eecd50c 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi35x-20260331 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260401 model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -188,7 +188,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm700-mi35x-20260331 + image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260401 model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x From b8b2ab4a5377f0a9a0abd8da0b1fe60c71971d02 Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:00:46 -0500 Subject: [PATCH 18/19] Refine search-space configurations in amd-master.yaml for Qwen3.5 benchmarks, adjusting parameters to optimize performance for MI355X. Remove a trailing blank line from perf-changelog.yaml. 
--- .github/configs/amd-master.yaml | 9 +++++---- perf-changelog.yaml | 1 - 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index 22eecd50c..c41329fca 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -199,13 +199,14 @@ qwen3.5-fp8-mi355x-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } + - { tp: 2, ep: 1, conc-start: 4, conc-end: 32 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 256 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 256 } - - { tp: 4, ep: 4, conc-start: 16, conc-end: 256 } + - { tp: 2, ep: 2, conc-start: 4, conc-end: 32 } + - { tp: 4, ep: 1, conc-start: 32, conc-end: 256 } + - { tp: 8, ep: 8, conc-start: 16, conc-end: 256 } qwen3.5-fp8-mi300x-sglang: image: lmsysorg/sglang:v0.5.9-rocm720-mi30x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c5571cd7e..e9811ff8b 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1287,4 +1287,3 @@ - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/980 - From 2bed7576b1e50d26c9671280d0ea4aa6dc6c523c Mon Sep 17 00:00:00 2001 From: "Chen, Todd" Date: Wed, 8 Apr 2026 04:03:04 -0500 Subject: [PATCH 19/19] Update image tags in amd-master.yaml and perf-changelog.yaml for Qwen3.5 benchmarks, replacing outdated sglang image references with the latest version to ensure consistency and improved performance. 
--- .github/configs/amd-master.yaml | 4 ++-- perf-changelog.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/configs/amd-master.yaml b/.github/configs/amd-master.yaml index c41329fca..c28e11ecf 100644 --- a/.github/configs/amd-master.yaml +++ b/.github/configs/amd-master.yaml @@ -114,7 +114,7 @@ dsr1-fp8-mi355x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-bf16-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260401 + image: lmsysorg/sglang:v0.5.10-rocm720-mi35x model: Qwen/Qwen3.5-397B-A17B model-prefix: qwen3.5 runner: mi355x @@ -188,7 +188,7 @@ qwen3.5-fp8-mi325x-sglang: - { tp: 8, conc-start: 4, conc-end: 64 } qwen3.5-fp8-mi355x-sglang: - image: lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x-20260401 + image: lmsysorg/sglang:v0.5.10-rocm720-mi35x model: Qwen/Qwen3.5-397B-A17B-FP8 model-prefix: qwen3.5 runner: mi355x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index e9811ff8b..66ca7013d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1285,5 +1285,5 @@ - qwen3.5-fp8-mi355x-sglang description: - "Update cli args of Qwen3.5 FP8 and BF16 SGLang benchmarks for MI355X to achieve better performance" - - "Use lmsysorg/sglang-rocm:v0.5.10rc0-rocm720-mi35x" + - "Use lmsysorg/sglang:v0.5.10-rocm720-mi35x" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/980