From e7b0fc30984f7e5cdb61c8bcbcf02a2e3c65c53e Mon Sep 17 00:00:00 2001
From: lishuoshuo-amd <shuoli@amd.com>
Date: Tue, 21 Apr 2026 13:47:11 +0000
Subject: [PATCH 1/5] =?UTF-8?q?[AMD/Hyperloom]=20Tune=20dsr1-fp8-mi355x-sg?=
 =?UTF-8?q?lang:=20--num-continuous-decode-steps=204=20=E2=86=92=208?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 benchmarks/single_node/dsr1_fp8_mi325x.sh | 2 +-
 perf-changelog.yaml                       | 6 ++++++
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh
index ae1e930f0..1a7dd8234 100644
--- a/benchmarks/single_node/dsr1_fp8_mi325x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh
@@ -42,7 +42,7 @@ python3 -m sglang.launch_server \
 --mem-fraction-static=0.8 \
 --cuda-graph-max-bs=128 \
 --chunked-prefill-size=131072 \
---num-continuous-decode-steps=4 \
+--num-continuous-decode-steps=8 \
 --max-prefill-tokens=131072 \
 --kv-cache-dtype fp8_e4m3 \
 --attention-backend aiter \
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2bd2f025c..fef4fd11f 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,9 @@
+- config-keys:
+    - dsr1-fp8-mi355x-sglang
+  description:
+    - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1109
+
 - config-keys:
     - 70b-fp8-*-vllm
   description:

From b10c872e5260781247d8fe2c8e145f1806006456 Mon Sep 17 00:00:00 2001
From: lishuoshuo-amd <shuoli@amd.com>
Date: Wed, 22 Apr 2026 03:13:56 +0000
Subject: [PATCH 2/5] fix: revert dsr1_fp8_mi325x.sh change and tune dsr1_fp8_mi355x.sh instead

---
 benchmarks/single_node/dsr1_fp8_mi325x.sh | 2 +-
 benchmarks/single_node/dsr1_fp8_mi355x.sh | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/benchmarks/single_node/dsr1_fp8_mi325x.sh b/benchmarks/single_node/dsr1_fp8_mi325x.sh
index 1a7dd8234..ae1e930f0 100644
--- a/benchmarks/single_node/dsr1_fp8_mi325x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi325x.sh
@@ -42,7 +42,7 @@ python3 -m sglang.launch_server \
 --mem-fraction-static=0.8 \
 --cuda-graph-max-bs=128 \
 --chunked-prefill-size=131072 \
---num-continuous-decode-steps=8 \
+--num-continuous-decode-steps=4 \
 --max-prefill-tokens=131072 \
 --kv-cache-dtype fp8_e4m3 \
 --attention-backend aiter \
diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh
index d629437cf..1ce51ec87 100644
--- a/benchmarks/single_node/dsr1_fp8_mi355x.sh
+++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh
@@ -44,7 +44,7 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --chunked-prefill-size 196608 \
     --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
+    --num-continuous-decode-steps 8 \
     --max-prefill-tokens 196608 \
     --kv-cache-dtype fp8_e4m3 \
     --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

From 81f861f9003a205bb51ea50686c1404d2ac87e15 Mon Sep 17 00:00:00 2001
From: lishuoshuo-amd <shuoli@amd.com>
Date: Fri, 1 May 2026 08:12:58 +0800
Subject: [PATCH 3/5] fix: update changelog PR link

---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index fef4fd11f..f8adb0731 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -2,7 +2,7 @@
     - dsr1-fp8-mi355x-sglang
   description:
     - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1109
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243
 
 - config-keys:
     - 70b-fp8-*-vllm

From f13cd186a3e4294a735f2772468fe6a487c24acc Mon Sep 17 00:00:00 2001
From: lishuoshuo-amd <shuoli@amd.com>
Date: Fri, 1 May 2026 08:51:14 +0800
Subject: [PATCH 4/5] fix: move changelog entry to the end of perf-changelog.yaml

---
 perf-changelog.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index c81a92f32..f3da625ad 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,9 +1,3 @@
-- config-keys:
-    - dsr1-fp8-mi355x-sglang
-  description:
-    - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243
-
 - config-keys:
     - 70b-fp8-*-vllm
   description:
@@ -2044,3 +2038,9 @@
     - updated sglang container image 
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1027
 
+- config-keys:
+    - dsr1-fp8-mi355x-sglang
+  description:
+    - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243
+

From bf3d59fdd4fbdb7849eadffc7d2877ebfee77878 Mon Sep 17 00:00:00 2001
From: lishuoshuo-amd <shuoli@amd.com>
Date: Fri, 1 May 2026 08:57:50 +0800
Subject: [PATCH 5/5] chore: trigger sweep

Made-with: Cursor