From e7b0fc30984f7e5cdb61c8bcbcf02a2e3c65c53e Mon Sep 17 00:00:00 2001 From: lishuoshuo-amd Date: Tue, 21 Apr 2026 13:47:11 +0000 Subject: [PATCH 1/5] =?UTF-8?q?[AMD/Hyperloom]=20Tune=20dsr1-fp8-mi355x-sg?= =?UTF-8?q?lang:=20--num-continuous-decode-steps=204=20=E2=86=92=208?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmarks/single_node/dsr1_fp8_mi325x.sh | 2 +- perf-changelog.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index ae1e930f0..1a7dd8234 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -42,7 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static=0.8 \ --cuda-graph-max-bs=128 \ --chunked-prefill-size=131072 \ ---num-continuous-decode-steps=4 \ +--num-continuous-decode-steps=8 \ --max-prefill-tokens=131072 \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2bd2f025c..fef4fd11f 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,9 @@ +- config-keys: + - dsr1-fp8-mi355x-sglang + description: + - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1109 + - config-keys: - 70b-fp8-*-vllm description: From b10c872e5260781247d8fe2c8e145f1806006456 Mon Sep 17 00:00:00 2001 From: lishuoshuo-amd Date: Wed, 22 Apr 2026 03:13:56 +0000 Subject: [PATCH 2/5] fix: update dsr1_fp8_mi355x --- benchmarks/single_node/dsr1_fp8_mi325x.sh | 2 +- benchmarks/single_node/dsr1_fp8_mi355x.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh index 1a7dd8234..ae1e930f0 100644 --- a/benchmarks/single_node/dsr1_fp8_mi325x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh @@ -42,7 +42,7 @@ python3 -m sglang.launch_server \ --mem-fraction-static=0.8 \ --cuda-graph-max-bs=128 \ --chunked-prefill-size=131072 \ ---num-continuous-decode-steps=8 \ +--num-continuous-decode-steps=4 \ --max-prefill-tokens=131072 \ --kv-cache-dtype fp8_e4m3 \ --attention-backend aiter \ diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh index d629437cf..1ce51ec87 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh @@ -44,7 +44,7 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --chunked-prefill-size 196608 \ --mem-fraction-static 0.8 --disable-radix-cache \ - --num-continuous-decode-steps 4 \ + --num-continuous-decode-steps 8 \ --max-prefill-tokens 196608 \ --kv-cache-dtype fp8_e4m3 \ --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & From 81f861f9003a205bb51ea50686c1404d2ac87e15 Mon Sep 17 00:00:00 2001 From: lishuoshuo-amd Date: Fri, 1 May 2026 08:12:58 +0800 Subject: [PATCH 3/5] fix: update changelog PR link --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index fef4fd11f..f8adb0731 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2,7 +2,7 @@ - dsr1-fp8-mi355x-sglang description: - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1109 + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243 - config-keys: - 70b-fp8-*-vllm From f13cd186a3e4294a735f2772468fe6a487c24acc Mon Sep 17 00:00:00 2001 From: lishuoshuo-amd Date: Fri, 1 May 2026 08:51:14 +0800 Subject: [PATCH 4/5] fix: append changelog entry --- perf-changelog.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index c81a92f32..f3da625ad 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,9 +1,3 @@ -- config-keys: - - dsr1-fp8-mi355x-sglang - description: - - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243 - - config-keys: - 70b-fp8-*-vllm description: @@ -2044,3 +2038,9 @@ - updated sglang container image pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1027 +- config-keys: + - dsr1-fp8-mi355x-sglang + description: + - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243 + From bf3d59fdd4fbdb7849eadffc7d2877ebfee77878 Mon Sep 17 00:00:00 2001 From: lishuoshuo-amd Date: Fri, 1 May 2026 08:57:50 +0800 Subject: [PATCH 5/5] chore: trigger sweep Made-with: Cursor