From 358e17b3ee18059a93d99b8c74097dcbe81adbb9 Mon Sep 17 00:00:00 2001 From: sglang-bot Date: Sun, 26 Apr 2026 03:30:38 -0700 Subject: [PATCH 1/3] dsv4-fp4 sglang b200/b300: floor --max-running-requests at 8 Mirrors the floor-of-4 pattern from the mi355x atom script (#1170); prevents tiny CONC values from yielding sub-optimal max-running-requests. Co-Authored-By: Claude Opus 4.7 --- benchmarks/single_node/dsv4_fp4_b200.sh | 2 +- benchmarks/single_node/dsv4_fp4_b300_sglang.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index e7a676b45..f691106e6 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -90,7 +90,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$((CONC * 3 / 2))" \ + --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ --mem-fraction-static 0.90 \ --swa-full-tokens-ratio 0.1 \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index 6fae10837..dededd071 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -110,7 +110,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$((CONC * 3 / 2))" \ + --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ --mem-fraction-static "$MEM_FRACTION_STATIC" \ --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & From 90faa5305203e004c4f3e301a1316e2dc242d237 Mon Sep 17 00:00:00 2001 From: sglang-bot Date: Sun, 26 Apr 2026 03:35:38 -0700 Subject: [PATCH 2/3] perf-changelog: add entry for #1173 (max-running-requests floor) Co-Authored-By: Claude Opus 4.7 --- perf-changelog.yaml | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 0bce77831..a627cbc10 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1862,3 +1862,10 @@ - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again" - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 + +- config-keys: + - dsv4-fp4-b200-sglang + - dsv4-fp4-b300-sglang + description: + - "Floor --max-running-requests at 8 in dsv4_fp4_b200.sh and dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173 From e0f0c32db23866e7fc1f87edeb33e31f079835cd Mon Sep 17 00:00:00 2001 From: sglang-bot Date: Sun, 26 Apr 2026 03:36:46 -0700 Subject: [PATCH 3/3] revert b200 change; scope to b300 only Co-Authored-By: Claude Opus 4.7 --- benchmarks/single_node/dsv4_fp4_b200.sh | 2 +- perf-changelog.yaml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index f691106e6..e7a676b45 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -90,7 +90,7 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \ + --max-running-requests "$((CONC * 3 / 2))" \ --mem-fraction-static 0.90 \ --swa-full-tokens-ratio 0.1 \ "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index a627cbc10..77c2dd31e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1864,8 +1864,7 @@ pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158 - config-keys: - - dsv4-fp4-b200-sglang - dsv4-fp4-b300-sglang description: - - "Floor --max-running-requests at 8 in dsv4_fp4_b200.sh and dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)" + - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173