SemiAnalysisAI · lishuoshuo-amd · Apr 21, 2026 · Apr 22, 2026 · May 1, 2026 · May 1, 2026
diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh
@@ -44,7 +44,7 @@ python3 -m sglang.launch_server \
     --trust-remote-code \
     --chunked-prefill-size 196608 \
     --mem-fraction-static 0.8 --disable-radix-cache \
-    --num-continuous-decode-steps 4 \
+    --num-continuous-decode-steps 8 \
     --max-prefill-tokens 196608 \
     --kv-cache-dtype fp8_e4m3 \
     --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -2061,7 +2061,13 @@
     - "Add TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths"
     - "Add --gpu-memory-utilization 0.9 to server launch"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1126
-
+
+- config-keys:
+    - dsr1-fp8-mi355x-sglang
+  description:
+    - "Tune --num-continuous-decode-steps from 4 to 8 (+4.7% average output throughput)"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243
+
 - config-keys:
     - dsv4-fp4-gb200-dynamo-vllm-mtp2
   description: