From 26e540d9e81ee12dc1fc9505b6e33b93b7d2f374 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:14:38 -0500
Subject: [PATCH 01/24] feat: add DeepSeek-V4-Flash FP4 B300 SGLang benchmark

Adds dsv4-fp4-b300-sglang config, single-node benchmark script, and
perf-changelog entry for the DeepSeek-V4 recipe from the SGLang
cookbook. The cookbook ships a B200 (not B300) recipe, so this
reuses the B200 Flash Low-Latency recipe on B300 until a
B300-specific recipe lands. Speculative decoding (EAGLE) and prefix
caching are disabled per request.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      | 22 +++++++
 benchmarks/single_node/dsv4_fp4_b300.sh | 76 +++++++++++++++++++++++++
 perf-changelog.yaml                     | 13 ++++-
 3 files changed, 110 insertions(+), 1 deletion(-)
 create mode 100755 benchmarks/single_node/dsv4_fp4_b300.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ec9cbc11e..a7dcdb20f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1796,6 +1796,28 @@ dsr1-fp8-b300-sglang:
     - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
     - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
+# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# lists B200 (not B300) as the Blackwell target. This config reuses the
+# B200 Flash FP4 Low-Latency recipe on B300 until a B300-specific recipe
+# ships. Speculative decoding (EAGLE) and prefix caching are disabled.
+dsv4-fp4-b300-sglang:
+  image: lmsysorg/sglang:deepseek-v4-blackwell
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
   model: Qwen/Qwen3.5-397B-A17B
diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
new file mode 100755
index 000000000..dc0244f36
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# only ships a B200 recipe for Blackwell. This script reuses the B200 Flash
+# FP4 Low-Latency recipe as-is on B300 until a B300-specific recipe ships.
+# Speculative decoding (EAGLE) and prefix caching are disabled per request.
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    EP_SIZE
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+echo "TP: $TP, EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor
+
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
+--trust-remote-code \
+--tensor-parallel-size=$TP --ep-size $EP_SIZE \
+--moe-runner-backend flashinfer_mxfp4 \
+--chunked-prefill-size 4096 \
+--disable-flashinfer-autotune \
+--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts "$((CONC * 10))" \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index ddc6409c2..41c5c080d 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1725,7 +1725,7 @@
     - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040
   
-- config-keys:  
+- config-keys:
     - glm5.1-fp4-mi355x-atom
   description:
     - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark"
@@ -1733,3 +1733,14 @@
     - "TP=2 and TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths"
     - "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Add DeepSeek-V4-Flash FP4 B300 SGLang benchmark"
+    - "Image: lmsysorg/sglang:deepseek-v4-blackwell"
+    - "Model: deepseek-ai/DeepSeek-V4-Flash (FP4 MoE experts + FP8 attention/dense)"
+    - "Reuses the B200 Flash Low-Latency recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships"
+    - "Speculative decoding (EAGLE) and prefix caching disabled"
+    - "TP=4/EP=4, concurrency 4-128 for 1k1k and 8k1k"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX

From efdc8ba8622e09207d4487423f478a70cb367bbc Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:18:57 -0500
Subject: [PATCH 02/24] fix: switch dsv4-fp4-b300-sglang to Pro +
 Max-Throughput recipe

Match parallelism (TP=8/EP=8/dp-attn=true) and concurrency ranges
(4-1024 for 1k1k, 4-512 for 8k1k) to dsv4-fp4-b200-vllm. Use the
DeepSeek-V4-Pro variant with the cookbook Max-Throughput recipe
(DP=8 + DeepEP, no MTP), which matches the requested parallelism
without speculative decoding. Prefix caching remains disabled.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      | 11 +++++-----
 benchmarks/single_node/dsv4_fp4_b300.sh | 28 +++++++++++++++++--------
 perf-changelog.yaml                     | 12 +++++------
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index a7dcdb20f..458c4c928 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1798,11 +1798,12 @@ dsr1-fp8-b300-sglang:
 
 # NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
 # lists B200 (not B300) as the Blackwell target. This config reuses the
-# B200 Flash FP4 Low-Latency recipe on B300 until a B300-specific recipe
-# ships. Speculative decoding (EAGLE) and prefix caching are disabled.
+# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300
+# until a B300-specific recipe ships. Prefix caching is disabled.
+# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
 dsv4-fp4-b300-sglang:
   image: lmsysorg/sglang:deepseek-v4-blackwell
-  model: deepseek-ai/DeepSeek-V4-Flash
+  model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
   precision: fp4
@@ -1812,11 +1813,11 @@ dsv4-fp4-b300-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
index dc0244f36..89b87ac24 100755
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -1,9 +1,10 @@
 #!/usr/bin/env bash
 
 # NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# only ships a B200 recipe for Blackwell. This script reuses the B200 Flash
-# FP4 Low-Latency recipe as-is on B300 until a B300-specific recipe ships.
-# Speculative decoding (EAGLE) and prefix caching are disabled per request.
+# only ships a B200 recipe for Blackwell. This script reuses the B200
+# DeepSeek-V4-Pro Max-Throughput recipe (DP=8 + DeepEP, no MTP) as-is on
+# B300 until a B300-specific recipe ships. Parallelism and concurrency
+# ranges mirror dsv4-fp4-b200-vllm. Prefix caching is disabled.
 
 source "$(dirname "$0")/../benchmark_lib.sh"
 
@@ -15,7 +16,8 @@ check_env_vars \
     OSL \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME \
-    EP_SIZE
+    EP_SIZE \
+    DP_ATTENTION
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -26,11 +28,17 @@ hf download "$MODEL"
 nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
 
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
-echo "TP: $TP, EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+DP_ATTN_ARGS=""
+if [ "$DP_ATTENTION" = "true" ]; then
+    DP_ATTN_ARGS="--data-parallel-size $TP --enable-dp-attention"
+fi
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -43,10 +51,12 @@ start_gpu_monitor
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
 --trust-remote-code \
---tensor-parallel-size=$TP --ep-size $EP_SIZE \
---moe-runner-backend flashinfer_mxfp4 \
---chunked-prefill-size 4096 \
---disable-flashinfer-autotune \
+--tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \
+--moe-a2a-backend deepep \
+--deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \
+--mem-fraction-static 0.82 \
+--cuda-graph-max-bs 64 \
+--max-running-requests 256 \
 --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 41c5c080d..bc8c1bffe 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1737,10 +1737,10 @@
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "Add DeepSeek-V4-Flash FP4 B300 SGLang benchmark"
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark"
     - "Image: lmsysorg/sglang:deepseek-v4-blackwell"
-    - "Model: deepseek-ai/DeepSeek-V4-Flash (FP4 MoE experts + FP8 attention/dense)"
-    - "Reuses the B200 Flash Low-Latency recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships"
-    - "Speculative decoding (EAGLE) and prefix caching disabled"
-    - "TP=4/EP=4, concurrency 4-128 for 1k1k and 8k1k"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
+    - "Model: deepseek-ai/DeepSeek-V4-Pro (FP4 MoE experts + FP8 attention/dense)"
+    - "Reuses the B200 Pro Max-Throughput recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships"
+    - "DP=8 + DeepEP, prefix caching disabled, no speculative decoding"
+    - "Parallelism (TP=8/EP=8/dp-attn=true) and concurrency ranges (4-1024 for 1k1k, 4-512 for 8k1k) mirror dsv4-fp4-b200-vllm"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132

From cc35a12e0ede9bae596aa45d6a1ff4009d46f10f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:22:09 -0500
Subject: [PATCH 03/24] chore: sync launch_b200-dgxc-slurm.sh cache mount from
 claude/add-dsv4-fp4-b200-vllm

Port the HF cache mount rework from the DSV4 B200 vLLM branch so
both PRs stay consistent: use the shared /scratch/fsw/gharunners/hf-hub-cache
path, drop the local MODEL override, and mount onto $HF_HUB_CACHE
inside the container.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b200-dgxc-slurm.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index c0f25310b..b9d4d90cc 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -249,8 +249,7 @@ EOF
 
 else
 
-    HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
-    export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
+    HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
     SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
@@ -276,7 +275,7 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \

From 404a097a6d5c3b28e1e89309fe2ddb2e48d60f87 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:23:12 -0500
Subject: [PATCH 04/24] fix: restore trailing whitespace stripped from glm5.1
 changelog entry

The dsv4-fp4-b300-sglang entry was appended correctly, but the earlier
edit also stripped trailing spaces on an existing line, producing a
spurious deletion. Revert so the diff is additive-only.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index bc8c1bffe..5b00b2f3e 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1725,7 +1725,7 @@
     - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040
   
-- config-keys:
+- config-keys:  
     - glm5.1-fp4-mi355x-atom
   description:
     - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark"

From 97a488e978b5e3b787df04ccdf35a0a4622dfd43 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:29:10 -0500
Subject: [PATCH 05/24] chore: add flock-guarded squash import to B300 runner

Mirror the lockfile logic already in launch_b200-dgxc-slurm.sh and
launch_h200-dgxc-slurm.sh: serialize concurrent enroot imports of
the same squash file via flock, skip the import when the squash is
already valid, and override ENROOT_CACHE_PATH to avoid permission
issues with the system-wide cache on worker nodes.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index b49391a3c..1d8bd59b4 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -258,13 +258,27 @@ else
     SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+    LOCK_FILE="${SQUASH_FILE}.lock"
 
     # Pin to one of the known-good B300 nodes; others have hardware/network
     # issues that cause benchmarks to hang or fail to start.
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
-    srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
+    # Use flock to serialize concurrent imports to the same squash file
+    # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes
+    srun --jobid=$JOB_ID bash -c "
+        export ENROOT_CACHE_PATH=\$HOME/.cache/enroot
+        mkdir -p \$ENROOT_CACHE_PATH
+        exec 9>\"$LOCK_FILE\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import'
+        else
+            rm -f \"$SQUASH_FILE\"
+            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        fi
+    "
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \

From 106deeaae8d594789b4467429b9e0edd2effbc2f Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:45:45 -0500
Subject: [PATCH 06/24] fix: drop ENROOT_CACHE_PATH override from B300 runner

The override ("avoid permission issues with system-wide cache on
worker nodes") is a dgxc-slurm-specific workaround; launch_b300-nv.sh
is on the NV slurm cluster, not dgxc-slurm. Copying it in caused
the benchmark srun's pyxis shadow hook to fail with
'mkdir: cannot create directory pyxis_$JOBID.1/data: File exists'.
Keep the flock + skip-if-valid logic.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 1d8bd59b4..51596b7b7 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -266,10 +266,7 @@ else
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
     # Use flock to serialize concurrent imports to the same squash file
-    # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes
     srun --jobid=$JOB_ID bash -c "
-        export ENROOT_CACHE_PATH=\$HOME/.cache/enroot
-        mkdir -p \$ENROOT_CACHE_PATH
         exec 9>\"$LOCK_FILE\"
         flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
         if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then

From 4bb1f1ae599abf76cd954dbd0b0611d7caf4609a Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:07:39 -0500
Subject: [PATCH 07/24] chore: point B300 runner at shared
 gharunners/{squash,hf-hub-cache}

Move the squash cache from /data/squash to /data/home/sa-shared/gharunners/squash,
and the HF cache mount from /scratch/models to /data/home/sa-shared/gharunners/hf-hub-cache.
Also mount the host HF cache onto $HF_HUB_CACHE inside the container so
tools reading the default HF path pick it up (matches the B200 dgxc-slurm
runner). Drop the /scratch/models Qwen3.5 path override since that path
is no longer used.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 51596b7b7..ecb24b1a1 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -248,14 +248,8 @@ find . -name '.nfs*' -delete 2>/dev/null || true
 
 else
 
-    HF_HUB_CACHE_MOUNT="/scratch/models"
-    # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster,
-    # so point MODEL at the local copy. Other models fall through and use `hf download`
-    # against the mounted cache from their benchmark script.
-    if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
-        export MODEL="/scratch/models/${MODEL#*/}"
-    fi
-    SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    HF_HUB_CACHE_MOUNT="/data/home/sa-shared/gharunners/hf-hub-cache"
+    SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
@@ -279,7 +273,7 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \

From 744c5a0e3df14f0a9bc3b204cef787a4d9d58fb4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:24:50 -0500
Subject: [PATCH 08/24] fix: move enroot import out of srun to avoid pyxis
 namespace collision

Running two srun steps in the same allocation (flock+import, then the
benchmark --container-image srun) reproducibly fails on this cluster
with:
  error: pyxis: mkdir: cannot create directory
    '/scratch/data/user-$UID/pyxis_$JOBID.1/data': File exists
  error: pyxis:     [ERROR] /etc/enroot/hooks.d/10-shadow.sh exited with return code 1

Per NVIDIA/pyxis#138, two srun steps sharing an allocation can leave
enroot/pyxis state between steps. Collapsing to a single srun (the
benchmark) is the cleanest workaround. Move the flock-guarded
enroot import to the host side, before salloc.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index ecb24b1a1..f58d38abc 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -254,23 +254,29 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
+    # Import the squash file on the host (outside SLURM) rather than inside an
+    # srun step. Running two srun steps (one import, one benchmark) in the same
+    # allocation trips a pyxis namespace collision on this cluster:
+    #   error: pyxis: mkdir: cannot create directory
+    #       '/scratch/data/user-$UID/pyxis_$JOBID.1/data': File exists
+    # Collapsing to a single srun (the benchmark) avoids it entirely. flock
+    # serializes concurrent imports of the same squash by parallel GH jobs.
+    (
+        exec 9>"$LOCK_FILE"
+        flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; }
+        if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then
+            echo "Squash file already exists and is valid, skipping import"
+        else
+            rm -f "$SQUASH_FILE"
+            enroot import -o "$SQUASH_FILE" "docker://$IMAGE"
+        fi
+    )
+
     # Pin to one of the known-good B300 nodes; others have hardware/network
     # issues that cause benchmarks to hang or fail to start.
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
-    # Use flock to serialize concurrent imports to the same squash file
-    srun --jobid=$JOB_ID bash -c "
-        exec 9>\"$LOCK_FILE\"
-        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
-        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
-            echo 'Squash file already exists and is valid, skipping import'
-        else
-            rm -f \"$SQUASH_FILE\"
-            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
-        fi
-    "
-
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \

From d003c59c5917175266dc0b30cf45d904d17800a7 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:34:34 -0500
Subject: [PATCH 09/24] fix: wipe stale pyxis scratch dirs for this JOB_ID
 before benchmark srun

Even with a single srun step, pyxis fails with
  error: pyxis: mkdir: cannot create directory
      '/scratch/data/user-$UID/pyxis_$JOBID.0/data': File exists
even on newly allocated SLURM job IDs. SLURM eventually reuses job IDs,
and the /scratch directory left behind by the earlier job with the same
ID is never removed (the cluster's pyxis epilog doesn't clean it up).
Wipe pyxis_$JOBID.* from the host after salloc; this is a no-op if
/scratch is node-local and effective if it's shared NFS.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index f58d38abc..dbd4eae72 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -277,6 +277,14 @@ else
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
+    # Stale pyxis scratch from prior jobs with reused SLURM job IDs breaks the
+    # next container srun with
+    #   error: pyxis: mkdir: cannot create directory
+    #       '/scratch/data/user-$UID/pyxis_$JOBID.0/data': File exists
+    # If /scratch is shared across b300 nodes this cleanup works; if it's
+    # node-local it's a harmless no-op.
+    rm -rf "/scratch/data/user-$(id -u)/pyxis_${JOB_ID}."* 2>/dev/null || true
+
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \

From f00629fa68a602ceab888fe1407957bbd005b6b3 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:44:07 -0500
Subject: [PATCH 10/24] Revert: drop all B300 runner changes, mirror #1128's
 approach

PR #1128 (dsv4-fp4fp8-b300-vllm) runs on the same cluster with ZERO
changes to launch_b300-nv.sh. The pyxis 10-shadow.sh failures we were
chasing aren't caused by the runner -- reset it to origin/main and
keep the sglang config/bench additions only.

Reverts (from this branch):
- 4bb1f1ae point B300 runner at shared gharunners/{squash,hf-hub-cache}
- 106deeaa drop ENROOT_CACHE_PATH override
- 97a488e9 add flock-guarded squash import
- 744c5a0e move enroot import out of srun
- d003c59c wipe stale pyxis scratch before benchmark srun

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 39 ++++++++++-----------------------------
 1 file changed, 10 insertions(+), 29 deletions(-)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index dbd4eae72..b49391a3c 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -248,46 +248,27 @@ find . -name '.nfs*' -delete 2>/dev/null || true
 
 else
 
-    HF_HUB_CACHE_MOUNT="/data/home/sa-shared/gharunners/hf-hub-cache"
-    SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    HF_HUB_CACHE_MOUNT="/scratch/models"
+    # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster,
+    # so point MODEL at the local copy. Other models fall through and use `hf download`
+    # against the mounted cache from their benchmark script.
+    if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
+        export MODEL="/scratch/models/${MODEL#*/}"
+    fi
+    SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
-    LOCK_FILE="${SQUASH_FILE}.lock"
-
-    # Import the squash file on the host (outside SLURM) rather than inside an
-    # srun step. Running two srun steps (one import, one benchmark) in the same
-    # allocation trips a pyxis namespace collision on this cluster:
-    #   error: pyxis: mkdir: cannot create directory
-    #       '/scratch/data/user-$UID/pyxis_$JOBID.1/data': File exists
-    # Collapsing to a single srun (the benchmark) avoids it entirely. flock
-    # serializes concurrent imports of the same squash by parallel GH jobs.
-    (
-        exec 9>"$LOCK_FILE"
-        flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; }
-        if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then
-            echo "Squash file already exists and is valid, skipping import"
-        else
-            rm -f "$SQUASH_FILE"
-            enroot import -o "$SQUASH_FILE" "docker://$IMAGE"
-        fi
-    )
 
     # Pin to one of the known-good B300 nodes; others have hardware/network
     # issues that cause benchmarks to hang or fail to start.
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
-    # Stale pyxis scratch from prior jobs with reused SLURM job IDs breaks the
-    # next container srun with
-    #   error: pyxis: mkdir: cannot create directory
-    #       '/scratch/data/user-$UID/pyxis_$JOBID.0/data': File exists
-    # If /scratch is shared across b300 nodes this cleanup works; if it's
-    # node-local it's a harmless no-op.
-    rm -rf "/scratch/data/user-$(id -u)/pyxis_${JOB_ID}."* 2>/dev/null || true
+    srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \

From 570b0ebcc39a003750edb1bc9c46512f1ab148bf Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 09:57:17 -0500
Subject: [PATCH 11/24] runner: add head-node flock-guarded squash import on
 B300

Move enroot import out of srun to the head node and serialize parallel
GH jobs with flock on a lock file next to the shared squash file
(${SQUASH_FILE}.lock). Skip the import when a valid squash already
exists. The benchmark srun is now the only step in the allocation.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b300-nv.sh | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index b49391a3c..0dbc15d17 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -255,17 +255,30 @@ else
     if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
         export MODEL="/scratch/models/${MODEL#*/}"
     fi
-    SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+    SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+    LOCK_FILE="${SQUASH_FILE}.lock"
+
+    # Import the squash file on the head node (outside any srun) under flock.
+    # Parallel GH jobs target the same shared squash path; flock serializes
+    # imports so only one job pulls and writes the file while the rest wait.
+    (
+        exec 9>"$LOCK_FILE"
+        flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; }
+        if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then
+            echo "Squash file already exists and is valid, skipping import"
+        else
+            rm -f "$SQUASH_FILE"
+            enroot import -o "$SQUASH_FILE" "docker://$IMAGE"
+        fi
+    )
 
     # Pin to one of the known-good B300 nodes; others have hardware/network
     # issues that cause benchmarks to hang or fail to start.
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
-    srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
-
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
         --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \

From 864419d8b3c06ec31e2603db64ef68955acdb3ea Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 10:35:31 -0500
Subject: [PATCH 12/24] fix: mount at /ix and clear baked-in
 CUDA_VISIBLE_DEVICES

Port the B200 branch's fix for the lmsysorg/sglang:deepseek-v4-blackwell
image on B300:
- The image installs sglang editable under /workspace/sglang; the default
  $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and breaks
  'import sglang'. For this image, mount at /ix instead.
- The image's ENV bakes CUDA_VISIBLE_DEVICES=4,5,6,7, masking half the
  GPUs Slurm allocates. Unset it in the bench script so TP=8 sees all 8.
- Write artefacts under $PWD instead of hard-coded /workspace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b300.sh | 12 ++++++++++--
 runners/launch_b300-nv.sh               | 15 +++++++++++++--
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
index 89b87ac24..90c0e681a 100755
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -30,7 +30,15 @@ nvidia-smi
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
 
-SERVER_LOG=/workspace/server.log
+# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
+# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
+# all ranks.
+unset CUDA_VISIBLE_DEVICES
+
+# The runner mounts this repo at a non-/workspace path for the deepseek-v4-blackwell
+# image (it installs sglang editable under /workspace/sglang, which our bind-mount
+# would hide), so write artefacts relative to $PWD instead of a hard-coded /workspace.
+SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
 echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
@@ -75,7 +83,7 @@ run_benchmark_serving \
     --num-prompts "$((CONC * 10))" \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir "$PWD/"
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 0dbc15d17..8ce5481ba 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -260,6 +260,17 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
+    # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
+    # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+    # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+    # breaks `import sglang`. Mount this one image at /ix instead; drop the
+    # conditional once the image stops installing editable under /workspace.
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+        CONTAINER_MOUNT_DIR=/ix
+    else
+        CONTAINER_MOUNT_DIR=/workspace
+    fi
+
     # Import the squash file on the head node (outside any srun) under flock.
     # Parallel GH jobs target the same shared squash path; flock serializes
     # imports so only one job pulls and writes the file while the rest wait.
@@ -281,9 +292,9 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
         --no-container-mount-home \
-        --container-workdir=/workspace/ \
+        --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 

From 9453676d638370bd8c1ae9ab8c60c58829974ce7 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 10:58:09 -0500
Subject: [PATCH 13/24] runner: use /data/models pre-staged path for dsv4 on
 B300

Pre-staged models on the B300 cluster live under /data/models
(Qwen3.5-397B-A17B-FP8, dsv4-pro, etc.). Switch HF_HUB_CACHE_MOUNT
from /scratch/models to /data/models, and export MODEL to
/data/models/dsv4-pro when MODEL_PREFIX=dsv4 so the benchmark reads
from the mounted dir directly. The bench script skips `hf download`
when MODEL looks like an absolute path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b300.sh |  6 +++++-
 runners/launch_b300-nv.sh               | 13 ++++++++-----
 2 files changed, 13 insertions(+), 6 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
index 90c0e681a..8ccbb9ead 100755
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -23,7 +23,11 @@ if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
 fi
 
-hf download "$MODEL"
+# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
+# `hf download`. Only fetch when MODEL looks like a HF repo ID.
+if [[ "$MODEL" != /* ]]; then
+    hf download "$MODEL"
+fi
 
 nvidia-smi
 
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 8ce5481ba..76586238f 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -248,12 +248,15 @@ find . -name '.nfs*' -delete 2>/dev/null || true
 
 else
 
-    HF_HUB_CACHE_MOUNT="/scratch/models"
-    # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster,
-    # so point MODEL at the local copy. Other models fall through and use `hf download`
-    # against the mounted cache from their benchmark script.
+    # Pre-staged models on the B300 cluster live under /data/models. Point MODEL
+    # at the local copy so the benchmark skips `hf download` and reads from the
+    # mounted dir. Other models fall through and use `hf download` from their
+    # benchmark script.
+    HF_HUB_CACHE_MOUNT="/data/models"
     if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
-        export MODEL="/scratch/models/${MODEL#*/}"
+        export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
+    elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then
+        export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
     fi
     SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')

From 5db43b8b63c5f6affd71ae8a27c3c62d3aef5626 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 14:18:00 -0500
Subject: [PATCH 14/24] fix: switch B300 dsv4 sglang to bw-ultra-compiled image

The stock lmsysorg/sglang:deepseek-v4-blackwell image ships kernels
compiled for B200 (SM_100) and crashes on B300 with
  RuntimeError: RMSNorm failed with error code no kernel image is
  available for execution on the device
during CUDA graph capture. Switch to cquil/sglang-deepseek-v4-bw-ultra:v1,
which is recompiled with B300 SM support.

Broaden the /ix mount conditional to match both image tags: the fork
keeps the same /workspace/sglang editable install that would otherwise
be masked by $GITHUB_WORKSPACE:/workspace/.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml |  2 +-
 runners/launch_b300-nv.sh          | 11 ++++++-----
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index ea7ff6e9f..294cfe47f 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1802,7 +1802,7 @@ dsr1-fp8-b300-sglang:
 # until a B300-specific recipe ships. Prefix caching is disabled.
 # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
 dsv4-fp4-b300-sglang:
-  image: lmsysorg/sglang:deepseek-v4-blackwell
+  image: cquil/sglang-deepseek-v4-bw-ultra:v1
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 76586238f..cc357015c 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -263,12 +263,13 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
-    # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
-    # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
-    # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
-    # breaks `import sglang`. Mount this one image at /ix instead; drop the
+    # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell (and its B300-recompiled
+    # fork cquil/sglang-deepseek-v4-bw-ultra) installs sglang editable at
+    # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang),
+    # so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install
+    # and breaks `import sglang`. Mount these images at /ix instead; drop the
     # conditional once the image stops installing editable under /workspace.
-    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* ]]; then
         CONTAINER_MOUNT_DIR=/ix
     else
         CONTAINER_MOUNT_DIR=/workspace

From c060c58dae0a8fa6b8576e48ccf9e88a1d8a75a5 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 14:43:11 -0500
Subject: [PATCH 15/24] fix: switch B300 dsv4 sglang image to
 yhyang201/sglang-b300:v3

Use the B300-recompiled image from yhyang201; extend the /ix mount
conditional to match the new tag in addition to the previous
deepseek-v4-blackwell / deepseek-v4-bw-ultra patterns.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 2 +-
 runners/launch_b300-nv.sh          | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 294cfe47f..11c1a43f0 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1802,7 +1802,7 @@ dsr1-fp8-b300-sglang:
 # until a B300-specific recipe ships. Prefix caching is disabled.
 # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
 dsv4-fp4-b300-sglang:
-  image: cquil/sglang-deepseek-v4-bw-ultra:v1
+  image: yhyang201/sglang-b300:v3
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index cc357015c..600912877 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -263,13 +263,13 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
-    # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell (and its B300-recompiled
-    # fork cquil/sglang-deepseek-v4-bw-ultra) installs sglang editable at
-    # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang),
+    # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell
+    # and its B300-recompiled forks like yhyang201/sglang-b300) install sglang
+    # editable at /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang),
     # so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install
     # and breaks `import sglang`. Mount these images at /ix instead; drop the
     # conditional once the image stops installing editable under /workspace.
-    if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* ]]; then
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *sglang-b300* ]]; then
         CONTAINER_MOUNT_DIR=/ix
     else
         CONTAINER_MOUNT_DIR=/workspace

From 08edf26c59c3735ef4c01a41539fd155fcc39663 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 14:44:34 -0500
Subject: [PATCH 16/24] bench: launch B300 dsv4 with sglang serve and
 flashinfer_mxfp4 MoE

---
 benchmarks/single_node/dsv4_fp4_b300.sh | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
index 8ccbb9ead..57932e929 100755
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -61,15 +61,17 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \
---trust-remote-code \
---tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \
---moe-a2a-backend deepep \
---deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \
---mem-fraction-static 0.82 \
---cuda-graph-max-bs 64 \
---max-running-requests 256 \
---disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+PYTHONNOUSERSITE=1 sglang serve \
+    --model-path $MODEL \
+    --host 0.0.0.0 \
+    --port $PORT \
+    --trust-remote-code \
+    --tp $TP \
+    --moe-runner-backend flashinfer_mxfp4 \
+    --mem-fraction-static 0.82 \
+    --chunked-prefill-size 4096 \
+    --disable-flashinfer-autotune \
+    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

From a699ca091a331e5b7814c3695f7b79102fd5ac80 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 15:50:03 -0500
Subject: [PATCH 17/24] feat(dsv4-fp4-b300-sglang): pick recipe by CONC; split
 search-space

Mirror chore/dsv4-sgl-b200 commits 103a202c + 43be495b for B300:

Bench script now selects one of three cookbook recipes by CONC instead
of a single static flag set:
  CONC <= 32   -> low-latency    (TP only, chunked-prefill 4096,
                                  disable-flashinfer-autotune)
  33..128      -> balanced       (+ DP-attention, max-running-reqs=128,
                                  cuda-graph-max-bs=64, deepep-config)
  CONC > 128   -> max-throughput (+ DP-attention, max-running-reqs=256,
                                  cuda-graph-max-bs=64, deepep-config)
No speculative decoding in any recipe; --disable-radix-cache kept for
the no-prefix-caching baseline.

Split the dsv4-fp4-b300-sglang search-space rows per recipe boundary so
result filenames (ep=, dpa=) accurately reflect which recipe ran.
ep=8 on balanced/max-throughput reflects sglang's implicit
ep_size=tp_size override when --moe-a2a-backend deepep is set.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      | 22 ++++++-
 benchmarks/single_node/dsv4_fp4_b300.sh | 87 ++++++++++++++++---------
 2 files changed, 78 insertions(+), 31 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 11c1a43f0..c9a3368cb 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1809,15 +1809,33 @@ dsv4-fp4-b300-sglang:
   precision: fp4
   framework: sglang
   multinode: false
+  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+  # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC:
+  #   low-latency    (CONC <= 32):       TP-only
+  #   balanced       (32 < CONC <= 128): + DP-attn
+  #   max-throughput (CONC > 128):       + DP-attn
+  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
+  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+  # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 }
+    # low-latency
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    # balanced
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+    # max-throughput
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 }
+    # low-latency
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    # balanced
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+    # max-throughput
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
index 57932e929..faa946174 100755
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -1,11 +1,5 @@
 #!/usr/bin/env bash
 
-# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# only ships a B200 recipe for Blackwell. This script reuses the B200
-# DeepSeek-V4-Pro Max-Throughput recipe (DP=8 + DeepEP, no MTP) as-is on
-# B300 until a B300-specific recipe ships. Parallelism and concurrency
-# ranges mirror dsv4-fp4-b200-vllm. Prefix caching is disabled.
-
 source "$(dirname "$0")/../benchmark_lib.sh"
 
 check_env_vars \
@@ -15,9 +9,7 @@ check_env_vars \
     ISL \
     OSL \
     RANDOM_RANGE_RATIO \
-    RESULT_FILENAME \
-    EP_SIZE \
-    DP_ATTENTION
+    RESULT_FILENAME
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -32,25 +24,23 @@ fi
 nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
-export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
 
-# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
-# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
-# all ranks.
+# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
+# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half
+# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
 unset CUDA_VISIBLE_DEVICES
 
-# The runner mounts this repo at a non-/workspace path for the deepseek-v4-blackwell
-# image (it installs sglang editable under /workspace/sglang, which our bind-mount
-# would hide), so write artefacts relative to $PWD instead of a hard-coded /workspace.
+# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
+# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for these images so the
+# editable install stays visible. Paths in this script are $PWD-relative for
+# that reason. Drop the runner conditional once lmsys moves sglang back out of
+# /workspace.
+
 SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
-echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"
-
-DP_ATTN_ARGS=""
-if [ "$DP_ATTENTION" = "true" ]; then
-    DP_ATTN_ARGS="--data-parallel-size $TP --enable-dp-attention"
-fi
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -58,7 +48,49 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
 fi
 
-start_gpu_monitor
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
+#   - low-latency    (CONC <= 32):        TP-only, chunked-prefill, disable autotune
+#   - balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
+#   - max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 4096
+        --disable-flashinfer-autotune
+        --mem-fraction-static 0.82
+    )
+elif [[ $CONC -le 128 ]]; then
+    RECIPE=balanced
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 128
+    )
+else
+    RECIPE=max-throughput
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 256
+    )
+fi
+echo "Recipe: $RECIPE (CONC=$CONC)"
 
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
@@ -67,11 +99,8 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --moe-runner-backend flashinfer_mxfp4 \
-    --mem-fraction-static 0.82 \
-    --chunked-prefill-size 4096 \
-    --disable-flashinfer-autotune \
-    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --disable-radix-cache \
+    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 
@@ -86,7 +115,7 @@ run_benchmark_serving \
     --input-len "$ISL" \
     --output-len "$OSL" \
     --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts "$((CONC * 10))" \
+    --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
     --result-dir "$PWD/"

From d35696cab3b0e1c51f6ae2334b0a0c36b058e62c Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 15:56:58 -0500
Subject: [PATCH 18/24] fix: switch B300 dsv4 sglang image to
 lmsysorg/sglang:deepseek-v4-b300

Switch B300 dsv4 sglang image to lmsysorg/sglang:deepseek-v4-b300
and extend the /ix mount conditional to match the new tag.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 2 +-
 runners/launch_b300-nv.sh          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index c9a3368cb..1c9f9beba 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1802,7 +1802,7 @@ dsr1-fp8-b300-sglang:
 # until a B300-specific recipe ships. Prefix caching is disabled.
 # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm.
 dsv4-fp4-b300-sglang:
-  image: yhyang201/sglang-b300:v3
+  image: lmsysorg/sglang:deepseek-v4-b300
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b300
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index 600912877..3daac0167 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -269,7 +269,7 @@ else
     # so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install
     # and breaks `import sglang`. Mount these images at /ix instead; drop the
     # conditional once the image stops installing editable under /workspace.
-    if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *sglang-b300* ]]; then
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *deepseek-v4-b300* || "$IMAGE" == *sglang-b300* ]]; then
         CONTAINER_MOUNT_DIR=/ix
     else
         CONTAINER_MOUNT_DIR=/workspace

From bc43672775655dee5e1e5666bb6f03cb5d876e5e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 16:40:55 -0500
Subject: [PATCH 19/24] feat(dsv4-fp4-b300-sglang): hardcode low-latency recipe
 at every CONC

The DeepEP FP8 weight-postprocess path is broken for
deepseek-ai/DeepSeek-V4-Pro on B300 with
lmsysorg/sglang:deepseek-v4-b300 -- every sglang launch with
--moe-a2a-backend deepep fails during model load with
  RuntimeError: Recipe must be a list/tuple of 3 integers.
raised from sglang.srt.layers.quantization.fp8
.process_weights_after_loading_block_quant (fp8.py:957). The balanced
and max-throughput recipes both go through that path; the low-latency
recipe (TP-only, flashinfer_mxfp4 MoE) does not and loads cleanly.

Collapse the yaml search-space back to a single row spanning the full
CONC range (4..1024 for 1k1k, 4..512 for 8k1k) and hardcode the bench
script to the low-latency flags at every CONC. TODO(Cam) noted in both
files to restore the recipe-per-CONC dispatch once the DeepEP FP8 load
path is fixed upstream.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      | 29 +++++---------
 benchmarks/single_node/dsv4_fp4_b300.sh | 53 ++++++-------------------
 2 files changed, 22 insertions(+), 60 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1c9f9beba..ea71490bd 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1809,33 +1809,22 @@ dsv4-fp4-b300-sglang:
   precision: fp4
   framework: sglang
   multinode: false
-  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-  # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC:
-  #   low-latency    (CONC <= 32):       TP-only
-  #   balanced       (32 < CONC <= 128): + DP-attn
-  #   max-throughput (CONC > 128):       + DP-attn
-  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
-  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
-  # while low-latency leaves ep_size at the default of 1.
+  # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
+  # while the DeepEP FP8 weight-postprocess path is broken for this
+  # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
+  # integers. raised from sglang.srt.layers.quantization.fp8
+  # .process_weights_after_loading_block_quant). Full concurrency sweep
+  # retained; restore the recipe-per-CONC split (balanced + max-throughput
+  # rows) once sglang can load the checkpoint under --moe-a2a-backend deepep.
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    # low-latency
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
-    # balanced
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
-    # max-throughput
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    # low-latency
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
-    # balanced
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
-    # max-throughput
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
index faa946174..79856c2ec 100755
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -50,46 +50,19 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
-#   - low-latency    (CONC <= 32):        TP-only, chunked-prefill, disable autotune
-#   - balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
-#   - max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
-DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-if [[ $CONC -le 32 ]]; then
-    RECIPE=low-latency
-    RECIPE_FLAGS=(
-        --moe-runner-backend flashinfer_mxfp4
-        --chunked-prefill-size 4096
-        --disable-flashinfer-autotune
-        --mem-fraction-static 0.82
-    )
-elif [[ $CONC -le 128 ]]; then
-    RECIPE=balanced
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
-    RECIPE_FLAGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend deepep
-        --deepep-config "$DEEPEP_CONFIG"
-        --mem-fraction-static 0.82
-        --cuda-graph-max-bs 64
-        --max-running-requests 128
-    )
-else
-    RECIPE=max-throughput
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
-    RECIPE_FLAGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend deepep
-        --deepep-config "$DEEPEP_CONFIG"
-        --mem-fraction-static 0.82
-        --cuda-graph-max-bs 64
-        --max-running-requests 256
-    )
-fi
+# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
+# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
+# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
+# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
+# Restore the CONC-based low-latency / balanced / max-throughput dispatch
+# once sglang can load the checkpoint under --moe-a2a-backend deepep.
+RECIPE=low-latency
+RECIPE_FLAGS=(
+    --moe-runner-backend flashinfer_mxfp4
+    --chunked-prefill-size 4096
+    --disable-flashinfer-autotune
+    --mem-fraction-static 0.82
+)
 echo "Recipe: $RECIPE (CONC=$CONC)"
 
 set -x

From 87c83764218be53bd4b5079d583b299a0c7e0792 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sat, 25 Apr 2026 00:11:03 -0500
Subject: [PATCH 20/24] trigger test check


From 90e8f3d8c32e04100eb1dc3635e5a3d82ab1ad88 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sat, 25 Apr 2026 00:25:02 -0500
Subject: [PATCH 21/24] Revert "feat(dsv4-fp4-b300-sglang): hardcode
 low-latency recipe at every CONC"

This reverts commit bc43672775655dee5e1e5666bb6f03cb5d876e5e.
---
 .github/configs/nvidia-master.yaml      | 29 +++++++++-----
 benchmarks/single_node/dsv4_fp4_b300.sh | 53 +++++++++++++++++++------
 2 files changed, 60 insertions(+), 22 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index bd8d5bddc..42c720a63 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1845,22 +1845,33 @@ dsv4-fp4-b300-sglang:
   precision: fp4
   framework: sglang
   multinode: false
-  # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
-  # while the DeepEP FP8 weight-postprocess path is broken for this
-  # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
-  # integers. raised from sglang.srt.layers.quantization.fp8
-  # .process_weights_after_loading_block_quant). Full concurrency sweep
-  # retained; restore the recipe-per-CONC split (balanced + max-throughput
-  # rows) once sglang can load the checkpoint under --moe-a2a-backend deepep.
+  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+  # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC:
+  #   low-latency    (CONC <= 32):       TP-only (no DP-attn, no DeepEP)
+  #   balanced       (32 < CONC <= 128): DP-attn + DeepEP, max-running-requests=128
+  #   max-throughput (CONC > 128):       DP-attn + DeepEP, max-running-requests=256
+  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
+  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+  # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
+    # low-latency
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    # balanced
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+    # max-throughput
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
+    # low-latency
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    # balanced
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+    # max-throughput
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
index 79856c2ec..faa946174 100755
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -50,19 +50,46 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
-# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
-# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
-# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
-# Restore the CONC-based low-latency / balanced / max-throughput dispatch
-# once sglang can load the checkpoint under --moe-a2a-backend deepep.
-RECIPE=low-latency
-RECIPE_FLAGS=(
-    --moe-runner-backend flashinfer_mxfp4
-    --chunked-prefill-size 4096
-    --disable-flashinfer-autotune
-    --mem-fraction-static 0.82
-)
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
+#   - low-latency    (CONC <= 32):        TP-only, chunked-prefill, disable autotune
+#   - balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
+#   - max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 4096
+        --disable-flashinfer-autotune
+        --mem-fraction-static 0.82
+    )
+elif [[ $CONC -le 128 ]]; then
+    RECIPE=balanced
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 128
+    )
+else
+    RECIPE=max-throughput
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 256
+    )
+fi
 echo "Recipe: $RECIPE (CONC=$CONC)"
 
 set -x

From 8e3158d4cabf90f995c5cfa5dd0f918dbe782012 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sat, 25 Apr 2026 00:34:52 -0500
Subject: [PATCH 22/24] trigger test check


From 623baa1a4dc8ce91fd86bd6a926dd6d76593125d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sat, 25 Apr 2026 01:40:21 -0500
Subject: [PATCH 23/24] Move dsv4 b300 sglang bench script to framework-tagged
 path

Per the runner naming convention introduced in #1146
(BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"), the b300
runner now prefers benchmarks/single_node/dsv4_fp4_b300_sglang.sh over
the legacy dsv4_fp4_b300.sh. The merge from main left this branch with
both scripts: the legacy file carrying the recipe-per-CONC dispatch
this PR added, and the framework-tagged file with the low-latency-only
fallback content from main. CI was therefore picking the wrong script.

Move the recipe-per-CONC dispatch onto dsv4_fp4_b300_sglang.sh and
delete the legacy filename so the runner picks up the intended logic.
Note: this patch changes only the two scripts; the yaml comment that
names the script path must be updated separately to point at the new path.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b300.sh       | 129 ------------------
 .../single_node/dsv4_fp4_b300_sglang.sh       |  54 ++++++--
 2 files changed, 40 insertions(+), 143 deletions(-)
 delete mode 100755 benchmarks/single_node/dsv4_fp4_b300.sh

diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
deleted file mode 100755
index faa946174..000000000
--- a/benchmarks/single_node/dsv4_fp4_b300.sh
+++ /dev/null
@@ -1,129 +0,0 @@
-#!/usr/bin/env bash
-
-source "$(dirname "$0")/../benchmark_lib.sh"
-
-check_env_vars \
-    MODEL \
-    TP \
-    CONC \
-    ISL \
-    OSL \
-    RANDOM_RANGE_RATIO \
-    RESULT_FILENAME
-
-if [[ -n "$SLURM_JOB_ID" ]]; then
-  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
-fi
-
-# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
-# `hf download`. Only fetch when MODEL looks like a HF repo ID.
-if [[ "$MODEL" != /* ]]; then
-    hf download "$MODEL"
-fi
-
-nvidia-smi
-
-export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
-
-# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
-# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half
-# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
-unset CUDA_VISIBLE_DEVICES
-
-# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
-# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
-# The runner mounts our repo at a non-/workspace path for these images so the
-# editable install stays visible. Paths in this script are $PWD-relative for
-# that reason. Drop the runner conditional once lmsys moves sglang back out of
-# /workspace.
-
-SERVER_LOG="$PWD/server.log"
-PORT=${PORT:-8888}
-
-echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
-
-EVAL_CONTEXT_ARGS=""
-if [ "${EVAL_ONLY}" = "true" ]; then
-    setup_eval_context
-    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
-fi
-
-start_gpu_monitor --output "$PWD/gpu_metrics.csv"
-
-# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
-#   - low-latency    (CONC <= 32):        TP-only, chunked-prefill, disable autotune
-#   - balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
-#   - max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
-DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
-
-if [[ $CONC -le 32 ]]; then
-    RECIPE=low-latency
-    RECIPE_FLAGS=(
-        --moe-runner-backend flashinfer_mxfp4
-        --chunked-prefill-size 4096
-        --disable-flashinfer-autotune
-        --mem-fraction-static 0.82
-    )
-elif [[ $CONC -le 128 ]]; then
-    RECIPE=balanced
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
-    RECIPE_FLAGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend deepep
-        --deepep-config "$DEEPEP_CONFIG"
-        --mem-fraction-static 0.82
-        --cuda-graph-max-bs 64
-        --max-running-requests 128
-    )
-else
-    RECIPE=max-throughput
-    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
-    RECIPE_FLAGS=(
-        --dp-size "$TP"
-        --enable-dp-attention
-        --moe-a2a-backend deepep
-        --deepep-config "$DEEPEP_CONFIG"
-        --mem-fraction-static 0.82
-        --cuda-graph-max-bs 64
-        --max-running-requests 256
-    )
-fi
-echo "Recipe: $RECIPE (CONC=$CONC)"
-
-set -x
-PYTHONNOUSERSITE=1 sglang serve \
-    --model-path $MODEL \
-    --host 0.0.0.0 \
-    --port $PORT \
-    --trust-remote-code \
-    --tp $TP \
-    --disable-radix-cache \
-    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
-
-SERVER_PID=$!
-
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-pip install -q datasets pandas
-
-run_benchmark_serving \
-    --model "$MODEL" \
-    --port "$PORT" \
-    --backend vllm \
-    --input-len "$ISL" \
-    --output-len "$OSL" \
-    --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts $((CONC * 10)) \
-    --max-concurrency "$CONC" \
-    --result-filename "$RESULT_FILENAME" \
-    --result-dir "$PWD/"
-
-if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT"
-    append_lm_eval_summary
-fi
-
-stop_gpu_monitor
-set +x
diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
index c9fb238a5..faa946174 100755
--- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
+++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh
@@ -50,20 +50,46 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
-# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
-# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
-# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
-# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
-# Restore the CONC-based low-latency / balanced / max-throughput dispatch
-# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
-# --moe-a2a-backend deepep.
-RECIPE=low-latency
-RECIPE_FLAGS=(
-    --moe-runner-backend flashinfer_mxfp4
-    --chunked-prefill-size 4096
-    --disable-flashinfer-autotune
-    --mem-fraction-static 0.82
-)
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
+#   - low-latency    (CONC <= 32):        TP-only, flashinfer_mxfp4 MoE runner, chunked-prefill, disable autotune
+#   - balanced       (32 < CONC <= 128):  + DP-attn + DeepEP a2a, max-running-requests=128
+#   - max-throughput (CONC > 128):        + DP-attn + DeepEP a2a, max-running-requests=256
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 4096
+        --disable-flashinfer-autotune
+        --mem-fraction-static 0.82
+    )
+elif [[ $CONC -le 128 ]]; then
+    RECIPE=balanced
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 128
+    )
+else
+    RECIPE=max-throughput
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 256
+    )
+fi
 echo "Recipe: $RECIPE (CONC=$CONC)"
 
 set -x

From 54b2ced5e80684c02c999dd3da8d61c5bb44a838 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Sat, 25 Apr 2026 01:43:18 -0500
Subject: [PATCH 24/24] chore(perf-changelog): tighten dsv4-fp4-b300-sglang
 entry

Now that DeepEP FP8 loads cleanly, this PR is purely about restoring
the recipe-per-CONC split on top of the low-latency-only fallback
from #1143. Trim the changelog to that delta.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 perf-changelog.yaml | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 70593a980..397da6591 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1816,10 +1816,6 @@
 - config-keys:
     - dsv4-fp4-b300-sglang
   description:
-    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark"
-    - "Image: lmsysorg/sglang:deepseek-v4-blackwell"
-    - "Model: deepseek-ai/DeepSeek-V4-Pro (FP4 MoE experts + FP8 attention/dense)"
-    - "Reuses the B200 Pro Max-Throughput recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships"
-    - "DP=8 + DeepEP, prefix caching disabled, no speculative decoding"
-    - "Parallelism (TP=8/EP=8/dp-attn=true) and concurrency ranges (4-1024 for 1k1k, 4-512 for 8k1k) mirror dsv4-fp4-b200-vllm"
+    - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the high-throughput scenario runs again"
+    - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132