diff --git a/benchmarks/single_node/dsr1_fp8_mi355x.sh b/benchmarks/single_node/dsr1_fp8_mi355x.sh index d629437cf..1ce51ec87 100644 --- a/benchmarks/single_node/dsr1_fp8_mi355x.sh +++ b/benchmarks/single_node/dsr1_fp8_mi355x.sh @@ -44,7 +44,7 @@ python3 -m sglang.launch_server \ --trust-remote-code \ --chunked-prefill-size 196608 \ --mem-fraction-static 0.8 --disable-radix-cache \ - --num-continuous-decode-steps 4 \ + --num-continuous-decode-steps 8 \ --max-prefill-tokens 196608 \ --kv-cache-dtype fp8_e4m3 \ --cuda-graph-max-bs "$CONC" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0403c2385..ee9d89aaa 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -2061,7 +2061,13 @@ - "Add TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths" - "Add --gpu-memory-utilization 0.9 to server launch" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1126 - + +- config-keys: + - dsr1-fp8-mi355x-sglang + description: + - "Tune --num-continuous-decode-steps 4 → 8 (+4.7% avg output throughput gain)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1243 + - config-keys: - dsv4-fp4-gb200-dynamo-vllm-mtp2 description: