SemiAnalysisAI · cquil11 · Apr 26, 2026 · Apr 26, 2026 · Apr 26, 2026 · Apr 26, 2026
@@ -1832,40 +1832,33 @@ dsr1-fp8-b300-sglang:
     - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
     - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
-# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# lists B200 (not B300) as the Blackwell target. This config reuses the
-# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
-# until a B300-specific recipe ships. Prefix caching is disabled.
-# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
+# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
+# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
+# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:deepseek-v4-b300@sha256:26e116bd211e300dbb76924d56c5cbe6cc3ee5ee2fe314859cb8774f5bc070f3
+  image: lmsysorg/sglang:deepseek-v4-b300
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
   precision: fp4
   framework: sglang
   multinode: false
-  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-  # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC:
-  #   low-latency    (CONC <= 32):       TP-only
-  #   balanced       (32 < CONC <= 128): + DP-attn
-  #   max-throughput (CONC > 128):       + DP-attn
-  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
-  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
-  # while low-latency leaves ep_size at the default of 1.
+  # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
+  # while the DeepEP FP8 weight-postprocess path is broken for this
+  # checkpoint on B300: the load fails with "RuntimeError: Recipe must be a
+  # list/tuple of 3 integers." raised from
+  # sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant.
+  # Full concurrency sweep retained; revert to the recipe-per-CONC split on
+  # chore/dsv4-sgl-b300 once sglang can load it under --moe-a2a-backend deepep.
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 1, conc-end: 1 }
-    - { tp: 4, ep: 1, conc-start: 32, conc-end: 32 }
-    - { tp: 4, ep: 4, dp-attn: true, conc-start: 512, conc-end: 512 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
 
 # DeepSeek-V4-Pro on B300 with EAGLE/MTP speculative decoding. Recipe is
 # selected inside benchmarks/single_node/dsv4_fp4_b300_sglang_mtp.sh by

diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -5,7 +5,6 @@ source "$(dirname "$0")/../benchmark_lib.sh"
 check_env_vars \
     MODEL \
     TP \
-    DP_ATTENTION \
     CONC \
     ISL \
     OSL \
@@ -24,13 +23,12 @@ fi
 
 nvidia-smi
 
-# Common SGLANG env vars (apply to every config).
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
-export SGLANG_OPT_SWA_SPLIT_LEAF_ON_INSERT=1
-export SGLANG_OPT_USE_JIT_NORM=1
-export SGLANG_OPT_USE_JIT_INDEXER_METADATA=1
-export SGLANG_OPT_USE_TOPK_V2=1
-export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
+
+# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
+# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which hides half
+# of the 8 GPUs Slurm allocates us. Unset it so all 8 GPUs are visible for TP=8.
+unset CUDA_VISIBLE_DEVICES
 
 # TODO(Cam): the deepseek-v4 sglang images install sglang editable at
 # /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
@@ -42,7 +40,7 @@ export SGLANG_OPT_USE_CUSTOM_ALL_REDUCE_V2=1
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
-echo "TP: $TP, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -52,57 +50,21 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# 1k inputs need more SWA cache headroom on B300 than 8k inputs do; 0.5 was
-# tuned empirically for the 1k1k recipe, while 0.1 is the cookbook default.
-if [[ "$ISL" == "1024" ]]; then
-    SWA_FULL_TOKENS_RATIO=0.5
-else
-    SWA_FULL_TOKENS_RATIO=0.1
-fi
-
-# Pick the parallelism + MoE backend based on DP_ATTENTION (mirrors the vllm
-# script's pattern). DP-attention runs the empirically-tuned high-concurrency
-# recipe (flashinfer_mxfp4 runner + halved prefill chunks + prefill-delayer);
-# single-instance uses flashinfer_mxfp4 with the cookbook defaults.
-DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-# Default; the DP-attn branch below overrides to 0.94.
-MEM_FRACTION_STATIC=0.90
-
-if [ "${DP_ATTENTION}" = "true" ]; then
-    export SGLANG_OPT_SWA_EVICT_DROP_PAGE_MARGIN=1
-    export SGLANG_OPT_USE_DEEPGEMM_MEGA_MOE=0
-    export SGLANG_OPT_FIX_HASH_MEGA_MOE=0
-    export SGLANG_OPT_USE_FAST_MASK_EP=1
-    export SGLANG_OPT_FIX_MEGA_MOE_MEMORY=1
-    export SGLANG_OPT_DEEPGEMM_MEGA_MOE_NUM_MAX_TOKENS_PER_RANK=4096
-    export SGLANG_OPT_FIX_NEXTN_MEGA_MOE=1
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=0
-    PARALLEL_ARGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-runner-backend flashinfer_mxfp4
-        --disable-flashinfer-autotune
-        --deepep-config "$DEEPEP_CONFIG"
-        --chunked-prefill-size 16384
-        --enable-prefill-delayer
-    )
-    MEM_FRACTION_STATIC=0.94
-else
-    PARALLEL_ARGS=(
-        --moe-runner-backend flashinfer_mxfp4
-        --chunked-prefill-size 8192
-        --disable-flashinfer-autotune
-    )
-fi
-
-# Print all SGLANG_* env vars to both the CI step log and server.log so the
-# launch config is auditable from the result artifact alone.
-{
-    echo "=== SGLANG_* env vars at launch ==="
-    env | grep -E '^SGLANG_' | sort
-    echo "==================================="
-} | tee "$SERVER_LOG"
+# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
+# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
+# ("RuntimeError: Recipe must be a list/tuple of 3 integers." raised from
+# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
+# Restore the CONC-based low-latency / balanced / max-throughput dispatch
+# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
+# --moe-a2a-backend deepep.
+RECIPE=low-latency
+RECIPE_FLAGS=(
+    --moe-runner-backend flashinfer_mxfp4
+    --chunked-prefill-size 4096
+    --disable-flashinfer-autotune
+    --mem-fraction-static 0.82
+)
+echo "Recipe: $RECIPE (CONC=$CONC)"
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
@@ -111,10 +73,8 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --max-running-requests "$(( CONC * 3 / 2 > 8 ? CONC * 3 / 2 : 8 ))" \
-    --mem-fraction-static "$MEM_FRACTION_STATIC" \
-    --swa-full-tokens-ratio "$SWA_FULL_TOKENS_RATIO" \
-    "${PARALLEL_ARGS[@]}" $EVAL_CONTEXT_ARGS >> $SERVER_LOG 2>&1 &
+    --disable-radix-cache \
+    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
@@ -1779,13 +1779,6 @@
     - "Prefix caching and speculative decoding disabled for baseline numbers"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131
 
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158
-
 - config-keys:
     - dsv4-fp8-mi355x-sglang
   description:
@@ -1856,26 +1849,6 @@
     - "Sweep will expand to TP=4/8 conc 4–256 once ROCm/ATOM PR3 (multi-request) and PR4 (CUDAGraph) land"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1170
 
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1158
-
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "Floor --max-running-requests at 8 in dsv4_fp4_b300_sglang.sh so low-CONC sweeps don't drop below the queue depth needed for stable benchmarking (CONC * 3 / 2 still applies above CONC=5)"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1173
-
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "better performance for dp-attention"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1174
-
 - config-keys:
     - dsv4-fp4-b300-sglang-mtp
   description:
@@ -1888,13 +1861,6 @@
     - "Configs: 1k1k and 8k1k, no validation.py / launcher / yaml-field changes (knob-free)"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1166
 
-- config-keys:
-    - dsv4-fp4-b300-sglang
-  description:
-    - "better performance for dp-attention"
-    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1178
-
 - config-keys:
    - dsv4-fp4-b300-vllm
   description: