SemiAnalysisAI · Oseltamivir · Apr 25, 2026 · Apr 25, 2026 · Apr 25, 2026
diff --git a/benchmarks/single_node/dsv4_fp8_mi355x.sh b/benchmarks/single_node/dsv4_fp8_mi355x.sh
@@ -83,13 +83,11 @@ python3 -m sglang.launch_server \
     --attention-backend compressed \
     --max-running-request 256 \
     --page-size 256 \
-    --chunked-prefill-size 4096 \
+    --chunked-prefill-size 8192 \
     --disable-shared-experts-fusion \
     --disable-cuda-graph \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
-    --mem-fraction-static 0.88 \
-    --max-total-tokens $((CONC * (ISL + OSL) + 200)) \
     --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -1819,3 +1819,10 @@
     - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
+
+- config-keys:
+    - dsv4-fp8-mi355x-sglang
+  description:
+    - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh"
+    - "Bump --chunked-prefill-size from 4096 to 8192"
+    - "Retrigger dsv4-fp8-mi355x-sglang"