diff --git a/benchmarks/single_node/dsv4_fp8_mi355x.sh b/benchmarks/single_node/dsv4_fp8_mi355x.sh
index 8ca19f71f..971b18b6a 100755
--- a/benchmarks/single_node/dsv4_fp8_mi355x.sh
+++ b/benchmarks/single_node/dsv4_fp8_mi355x.sh
@@ -83,13 +83,11 @@ python3 -m sglang.launch_server \
     --attention-backend compressed \
     --max-running-request 256 \
     --page-size 256 \
-    --chunked-prefill-size 4096 \
+    --chunked-prefill-size 8192 \
     --disable-shared-experts-fusion \
     --disable-cuda-graph \
     --tool-call-parser deepseekv4 \
     --reasoning-parser deepseek-v4 \
-    --mem-fraction-static 0.88 \
-    --max-total-tokens $((CONC * (ISL + OSL) + 200)) \
     --watchdog-timeout 1800 $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 397da6591..14d9e6704 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1819,3 +1819,10 @@
     - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
     - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
+
+- config-keys:
+    - dsv4-fp8-mi355x-sglang
+  description:
+    - "Drop --mem-fraction-static 0.88 and --max-total-tokens from dsv4_fp8_mi355x.sh"
+    - "Bump --chunked-prefill-size from 4096 to 8192"
+    - "Retrigger dsv4-fp8-mi355x-sglang"