Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
26e540d
feat: add DeepSeek-V4-Flash FP4 B300 SGLang benchmark
cquil11 Apr 24, 2026
efdc8ba
fix: switch dsv4-fp4-b300-sglang to Pro + Max-Throughput recipe
cquil11 Apr 24, 2026
cc35a12
chore: sync launch_b200-dgxc-slurm.sh cache mount from claude/add-dsv…
cquil11 Apr 24, 2026
404a097
fix: restore trailing whitespace stripped from glm5.1 changelog entry
cquil11 Apr 24, 2026
97a488e
chore: add flock-guarded squash import to B300 runner
cquil11 Apr 24, 2026
106deea
fix: drop ENROOT_CACHE_PATH override from B300 runner
cquil11 Apr 24, 2026
4bb1f1a
chore: point B300 runner at shared gharunners/{squash,hf-hub-cache}
cquil11 Apr 24, 2026
744c5a0
fix: move enroot import out of srun to avoid pyxis namespace collision
cquil11 Apr 24, 2026
d003c59
fix: wipe stale pyxis scratch dirs for this JOB_ID before benchmark srun
cquil11 Apr 24, 2026
f00629f
Revert: drop all B300 runner changes, mirror #1128's approach
cquil11 Apr 24, 2026
570b0eb
runner: add head-node flock-guarded squash import on B300
cquil11 Apr 24, 2026
864419d
fix: mount at /ix and clear baked-in CUDA_VISIBLE_DEVICES
cquil11 Apr 24, 2026
5d93913
Merge branch 'main' into chore/dsv4-sgl-b300
cquil11 Apr 24, 2026
9453676
runner: use /data/models pre-staged path for dsv4 on B300
cquil11 Apr 24, 2026
5db43b8
fix: switch B300 dsv4 sglang to bw-ultra-compiled image
cquil11 Apr 24, 2026
c060c58
fix: switch B300 dsv4 sglang image to yhyang201/sglang-b300:v3
cquil11 Apr 24, 2026
08edf26
update b300
cquil11 Apr 24, 2026
a699ca0
feat(dsv4-fp4-b300-sglang): pick recipe by CONC; split search-space
cquil11 Apr 24, 2026
d35696c
update b300
cquil11 Apr 24, 2026
bc43672
feat(dsv4-fp4-b300-sglang): hardcode low-latency recipe at every CONC
cquil11 Apr 24, 2026
87c8376
trigger test check
cquil11 Apr 25, 2026
aa423f0
Merge branch 'main' into chore/dsv4-sgl-b300
cquil11 Apr 25, 2026
90e8f3d
Revert "feat(dsv4-fp4-b300-sglang): hardcode low-latency recipe at ev…
cquil11 Apr 25, 2026
8e3158d
trigger test check
cquil11 Apr 25, 2026
623baa1
Move dsv4 b300 sglang bench script to framework-tagged path
cquil11 Apr 25, 2026
54b2ced
chore(perf-changelog): tighten dsv4-fp4-b300-sglang entry
cquil11 Apr 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 25 additions & 12 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1832,9 +1832,11 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists B200 (not B300) as the Blackwell target. This config reuses the
# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
# until a B300-specific recipe ships. Prefix caching is disabled.
# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300
model: deepseek-ai/DeepSeek-V4-Pro
Expand All @@ -1843,22 +1845,33 @@ dsv4-fp4-b300-sglang:
precision: fp4
framework: sglang
multinode: false
# TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
# while the DeepEP FP8 weight-postprocess path is broken for this
# checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
# integers. raised from sglang.srt.layers.quantization.fp8
# .process_weights_after_loading_block_quant). Full concurrency sweep
# retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
# once sglang can load the checkpoint under --moe-a2a-backend deepep.
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# are selected inside benchmarks/single_node/dsv4_fp4_b300_sglang.sh by CONC:
# low-latency (CONC <= 32): TP-only
# balanced (32 < CONC <= 128): + DP-attn
# max-throughput (32 < CONC <= 1024... i.e. CONC > 128): + DP-attn, higher max-running-requests cap than balanced
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while low-latency leaves ep_size at the default of 1.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
# low-latency
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
# low-latency
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

qwen3.5-bf16-b200-sglang:
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
Expand Down
54 changes: 40 additions & 14 deletions benchmarks/single_node/dsv4_fp4_b300_sglang.sh
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,46 @@ fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
# Restore the CONC-based low-latency / balanced / max-throughput dispatch
# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
# --moe-a2a-backend deepep.
RECIPE=low-latency
RECIPE_FLAGS=(
--moe-runner-backend flashinfer_mxfp4
--chunked-prefill-size 4096
--disable-flashinfer-autotune
--mem-fraction-static 0.82
)
# Recipe dispatch keyed on CONC, per
# https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
#   low-latency    (CONC <= 32):        TP-only, chunked prefill, autotune off
#   balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
#   max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

# Flags shared by the two DeepEP (DP-attention) recipes; balanced and
# max-throughput differ only in the --max-running-requests cap appended below.
dp_attn_common=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --deepep-config "$DEEPEP_CONFIG"
    --mem-fraction-static 0.82
    --cuda-graph-max-bs 64
)

if [[ $CONC -le 32 ]]; then
    RECIPE=low-latency
    RECIPE_FLAGS=(
        --moe-runner-backend flashinfer_mxfp4
        --chunked-prefill-size 4096
        --disable-flashinfer-autotune
        --mem-fraction-static 0.82
    )
elif [[ $CONC -le 128 ]]; then
    RECIPE=balanced
    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
    RECIPE_FLAGS=("${dp_attn_common[@]}" --max-running-requests 128)
else
    RECIPE=max-throughput
    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
    RECIPE_FLAGS=("${dp_attn_common[@]}" --max-running-requests 256)
fi
echo "Recipe: $RECIPE (CONC=$CONC)"

set -x
Expand Down
7 changes: 7 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1812,3 +1812,10 @@
- "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern"
- "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the max-throughput scenario runs again"
- "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
Loading