From f9a0ed9f8614be5aabf381cb4c644d129ccaf63e Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:10:08 -0500
Subject: [PATCH 01/20] Add dsv4-fp4-b200-sglang single-node config

Adds the DeepSeek-V4-Flash B200 SGLang recipe from
https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4.
Prefix caching and speculative decoding are disabled for baseline numbers.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      | 18 ++++++
 benchmarks/single_node/dsv4_fp4_b200.sh | 75 +++++++++++++++++++++++++
 perf-changelog.yaml                     |  9 +++
 3 files changed, 102 insertions(+)
 create mode 100755 benchmarks/single_node/dsv4_fp4_b200.sh

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 96273444f..49be01a98 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1669,6 +1669,24 @@ dsr1-fp4-b200-sglang:
     - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
     - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }
 
+dsv4-fp4-b200-sglang:
+  image: lmsysorg/sglang:deepseek-v4-blackwell
+  model: deepseek-ai/DeepSeek-V4-Flash
+  model-prefix: dsv4
+  runner: b200
+  precision: fp4
+  framework: sglang
+  multinode: false
+  seq-len-configs:
+  - isl: 1024
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+  - isl: 8192
+    osl: 1024
+    search-space:
+    - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 }
+
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
 # B200 SGLang recipe as-is until B300-specific tuning is available.
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
new file mode 100755
index 000000000..7faa661b2
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -0,0 +1,75 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME \
+    EP_SIZE
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+hf download "$MODEL"
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+SERVER_LOG=/workspace/server.log
+PORT=${PORT:-8888}
+
+if [[ $CONC -ge 16 ]]; then
+  SCHEDULER_RECV_INTERVAL=30
+else
+  SCHEDULER_RECV_INTERVAL=10
+fi
+echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor
+
+set -x
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+--tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+--moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \
+--chunked-prefill-size 4096 --disable-flashinfer-autotune \
+--scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
+--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $((CONC * 10)) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir /workspace/
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2b2e138c8..7dd1629b8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,3 +1,12 @@
+- config-keys:
+    - dsv4-fp4-b200-sglang
+  description:
+    - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)"
+    - "Container: lmsysorg/sglang:deepseek-v4-blackwell"
+    - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+    - "Prefix caching and speculative decoding disabled for baseline numbers"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
+
 - config-keys:
     - dsr1-fp8-h100-dynamo-trt
     - dsr1-fp8-h100-dynamo-sglang

From 44a1c1f490dc19f02dd41914278c47fbfece445d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:18:56 -0500
Subject: [PATCH 02/20] Switch dsv4-fp4-b200-sglang to Pro model, match vllm
 parallelism

Uses deepseek-ai/DeepSeek-V4-Pro with tp=8, ep=8, dp-attention enabled
and sweep concurrency ranges aligned with dsv4-fp4-b200-vllm (4-1024 at
1k/1k, 4-512 at 8k/1k). Script now passes --enable-dp-attention when
DP_ATTENTION=true and sets --mem-fraction-static per the Pro recipe.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      |  6 +++---
 benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++---
 perf-changelog.yaml                     |  5 +++--
 3 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 49be01a98..3a4695665 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1671,7 +1671,7 @@ dsr1-fp4-b200-sglang:
 
 dsv4-fp4-b200-sglang:
   image: lmsysorg/sglang:deepseek-v4-blackwell
-  model: deepseek-ai/DeepSeek-V4-Flash
+  model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
   runner: b200
   precision: fp4
@@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 }
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 }
 
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 7faa661b2..c5860e868 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -10,7 +10,8 @@ check_env_vars \
     OSL \
     RANDOM_RANGE_RATIO \
     RESULT_FILENAME \
-    EP_SIZE
+    EP_SIZE \
+    DP_ATTENTION
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -30,7 +31,12 @@ if [[ $CONC -ge 16 ]]; then
 else
   SCHEDULER_RECV_INTERVAL=10
 fi
-echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
+
+DP_ATTN_ARGS=""
+if [[ "$DP_ATTENTION" == "true" ]]; then
+  DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP"
+fi
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -42,8 +48,9 @@ start_gpu_monitor
 
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \
+--tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \
 --moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \
+--mem-fraction-static 0.82 \
 --chunked-prefill-size 4096 --disable-flashinfer-autotune \
 --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 7dd1629b8..45c0c8ebf 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1,11 +1,12 @@
 - config-keys:
     - dsv4-fp4-b200-sglang
   description:
-    - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)"
+    - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)"
     - "Container: lmsysorg/sglang:deepseek-v4-blackwell"
     - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+    - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config"
     - "Prefix caching and speculative decoding disabled for baseline numbers"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131
 
 - config-keys:
     - dsr1-fp8-h100-dynamo-trt

From c21ee5cce0f9a2c6d2d223d49b0248e9cddf34e4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:22:15 -0500
Subject: [PATCH 03/20] Match DSV4 Pro SGLang recipe literally; port HF cache
 path

Server launch now mirrors the DeepSeek-V4-Pro command from
https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4:
--tp N, --moe-runner-backend flashinfer_mxfp4, --mem-fraction-static
0.82, SGLANG_JIT_DEEPGEMM_PRECOMPILE=0. Speculative decoding omitted
and --disable-radix-cache added per the no-spec / no-prefix-cache
baseline. The YAML search-space drops dp-attn and lowers ep from 8 to 1
(tp stays at 8).

Also syncs runners/launch_b200-dgxc-slurm.sh with the HF cache mount
path from origin/claude/add-dsv4-fp4-b200-vllm so both PRs stay in
agreement on runner layout.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml      |  4 ++--
 benchmarks/single_node/dsv4_fp4_b200.sh | 22 ++++------------------
 runners/launch_b200-dgxc-slurm.sh       |  5 ++---
 3 files changed, 8 insertions(+), 23 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3a4695665..9e57fb398 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 }
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
 
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index c5860e868..0ed538599 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -9,9 +9,7 @@ check_env_vars \
     ISL \
     OSL \
     RANDOM_RANGE_RATIO \
-    RESULT_FILENAME \
-    EP_SIZE \
-    DP_ATTENTION
+    RESULT_FILENAME
 
 if [[ -n "$SLURM_JOB_ID" ]]; then
   echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
@@ -26,17 +24,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
-if [[ $CONC -ge 16 ]]; then
-  SCHEDULER_RECV_INTERVAL=30
-else
-  SCHEDULER_RECV_INTERVAL=10
-fi
-echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION"
-
-DP_ATTN_ARGS=""
-if [[ "$DP_ATTENTION" == "true" ]]; then
-  DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP"
-fi
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
 
 EVAL_CONTEXT_ARGS=""
 if [ "${EVAL_ONLY}" = "true" ]; then
@@ -48,11 +36,9 @@ start_gpu_monitor
 
 set -x
 PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
---tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \
---moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \
+--tp $TP \
+--moe-runner-backend flashinfer_mxfp4 \
 --mem-fraction-static 0.82 \
---chunked-prefill-size 4096 --disable-flashinfer-autotune \
---scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \
 --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index c0f25310b..b9d4d90cc 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -249,8 +249,7 @@ EOF
 
 else
 
-    HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
-    export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
+    HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
     SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
     FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
@@ -276,7 +275,7 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
         --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \

From 039977307b42c6c6c67b325dfe827b022133e5fc Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:27:00 -0500
Subject: [PATCH 04/20] fix: use 'sglang serve' CLI, not python -m
 sglang.launch_server

The deepseek-v4-blackwell image doesn't expose sglang via system
python3, so the module import fails:

  /usr/bin/python3: Error while finding module specification for
  'sglang.launch_server' (ModuleNotFoundError: No module named 'sglang')

Switch to the `sglang serve` entrypoint that the cookbook uses; the
CLI resolves the correct interpreter.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 0ed538599..0f443415a 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -35,7 +35,7 @@ fi
 start_gpu_monitor
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
 --tp $TP \
 --moe-runner-backend flashinfer_mxfp4 \
 --mem-fraction-static 0.82 \

From 4a3e3e95bd6378cdec4d0b632d58bae3a6d46a52 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:42:29 -0500
Subject: [PATCH 05/20] fix: mount repo at /ix for deepseek-v4-blackwell image
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang editable
at /workspace/sglang/python — unlike every prior sglang tag which uses
/sgl-workspace/sglang. Our $GITHUB_WORKSPACE:/workspace/ bind-mount
masks that directory, breaking `import sglang`.

Conditionally mount at /ix for this image only and make the dsv4
benchmark script use $PWD for server/metrics/result paths so it works
regardless of the mount target. All other configs still mount at
/workspace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh |  6 +++---
 runners/launch_b200-dgxc-slurm.sh       | 13 +++++++++++--
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 0f443415a..598fbc77d 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,7 +21,7 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
-SERVER_LOG=/workspace/server.log
+SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
@@ -32,7 +32,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
 fi
 
-start_gpu_monitor
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
 sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
@@ -57,7 +57,7 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir "$PWD/"
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index b9d4d90cc..5cb7c24fd 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -255,6 +255,15 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
+    # The deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python,
+    # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for
+    # this image so the in-image sglang source stays visible.
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+        CONTAINER_MOUNT_DIR=/ix
+    else
+        CONTAINER_MOUNT_DIR=/workspace
+    fi
+
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
@@ -275,9 +284,9 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
-        --container-workdir=/workspace/ \
+        --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 fi

From ffd0874f9730f38d744d5c3d431b4ea1f223c7e5 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:47:20 -0500
Subject: [PATCH 06/20] fix: reinstall sglang from PyPI to work around masked
 editable install

The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang editable at
/workspace/sglang/python, which our $GITHUB_WORKSPACE:/workspace/ bind-mount
masks. Temporary one-line workaround: pip install --no-deps sglang in the
benchmark script to restore a non-editable copy in site-packages. Runner
reverted to the standard /workspace mount. Marked with a TODO(Cam) for
the proper fix once lmsys publishes an image that doesn't editable-install
under /workspace.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++---
 runners/launch_b200-dgxc-slurm.sh       | 13 ++-----------
 2 files changed, 12 insertions(+), 14 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 598fbc77d..2f58a179b 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,7 +21,14 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
-SERVER_LOG="$PWD/server.log"
+# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image
+# installs sglang editable at /workspace/sglang/python, which the runner's
+# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any
+# custom patches baked into the image's local sglang source. Revert once lmsys
+# ships an image that installs sglang outside /workspace (or non-editable).
+pip install --no-deps --quiet sglang
+
+SERVER_LOG=/workspace/server.log
 PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
@@ -32,7 +39,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
 fi
 
-start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+start_gpu_monitor
 
 set -x
 sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
@@ -57,7 +64,7 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir "$PWD/"
+    --result-dir /workspace/
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index 5cb7c24fd..b9d4d90cc 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -255,15 +255,6 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
-    # The deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python,
-    # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for
-    # this image so the in-image sglang source stays visible.
-    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
-        CONTAINER_MOUNT_DIR=/ix
-    else
-        CONTAINER_MOUNT_DIR=/workspace
-    fi
-
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
@@ -284,9 +275,9 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
-        --container-workdir=$CONTAINER_MOUNT_DIR \
+        --container-workdir=/workspace/ \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 fi

From fef260fa0ab843e6acd8b5dafb2cf8b6cdb8ccc2 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:53:00 -0500
Subject: [PATCH 07/20] fix: uninstall editable sglang before reinstalling from
 PyPI

'pip install --no-deps sglang' is a no-op when sglang is already
registered in site-packages -- even if the underlying editable path
is missing -- so the prior workaround never actually swapped in a
working install. Uninstall the broken egg-link first, then reinstall.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 2f58a179b..bfeb30249 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -23,9 +23,11 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
 # TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image
 # installs sglang editable at /workspace/sglang/python, which the runner's
-# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any
-# custom patches baked into the image's local sglang source. Revert once lmsys
-# ships an image that installs sglang outside /workspace (or non-editable).
+# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable
+# link, then reinstall from PyPI (drops any custom patches baked into the
+# image's local sglang source). Revert once lmsys ships an image that installs
+# sglang outside /workspace (or non-editable).
+pip uninstall -y sglang 2>/dev/null || true
 pip install --no-deps --quiet sglang
 
 SERVER_LOG=/workspace/server.log

From da148a1637f0646dc3686ac5d7411ffdde12b04d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 01:56:49 -0500
Subject: [PATCH 08/20] fix: mount repo at /ix for deepseek-v4-blackwell; drop
 pip workaround

Back to the proper mount fix so we use the same
'PYTHONNOUSERSITE=1 python3 -m sglang.launch_server ...' invocation as
every other sglang single_node script. Conditional mount target keeps
the blast radius to this one config.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 23 ++++++++++-------------
 runners/launch_b200-dgxc-slurm.sh       | 15 +++++++++++++--
 2 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index bfeb30249..284ccfba3 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,16 +21,13 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
-# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image
-# installs sglang editable at /workspace/sglang/python, which the runner's
-# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable
-# link, then reinstall from PyPI (drops any custom patches baked into the
-# image's local sglang source). Revert once lmsys ships an image that installs
-# sglang outside /workspace (or non-editable).
-pip uninstall -y sglang 2>/dev/null || true
-pip install --no-deps --quiet sglang
-
-SERVER_LOG=/workspace/server.log
+# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
+# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for this image so the editable
+# install stays visible. Paths in this script are $PWD-relative for that reason.
+# Drop the runner conditional once lmsys moves sglang back out of /workspace.
+
+SERVER_LOG="$PWD/server.log"
 PORT=${PORT:-8888}
 
 echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
@@ -41,10 +38,10 @@ if [ "${EVAL_ONLY}" = "true" ]; then
     EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
 fi
 
-start_gpu_monitor
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
-sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
+PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
 --tp $TP \
 --moe-runner-backend flashinfer_mxfp4 \
 --mem-fraction-static 0.82 \
@@ -66,7 +63,7 @@ run_benchmark_serving \
     --num-prompts $((CONC * 10)) \
     --max-concurrency "$CONC" \
     --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/
+    --result-dir "$PWD/"
 
 if [ "${RUN_EVAL}" = "true" ]; then
     run_eval --framework lm-eval --port "$PORT"
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index b9d4d90cc..c07037ff4 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -255,6 +255,17 @@ else
     SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
     LOCK_FILE="${SQUASH_FILE}.lock"
 
+    # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
+    # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+    # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+    # breaks `import sglang`. Mount this one image at /ix instead; drop the
+    # conditional once the image stops installing editable under /workspace.
+    if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+        CONTAINER_MOUNT_DIR=/ix
+    else
+        CONTAINER_MOUNT_DIR=/workspace
+    fi
+
     salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
     JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
@@ -275,9 +286,9 @@ else
 
     srun --jobid=$JOB_ID \
         --container-image=$SQUASH_FILE \
-        --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+        --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
         --no-container-mount-home \
-        --container-workdir=/workspace/ \
+        --container-workdir=$CONTAINER_MOUNT_DIR \
         --no-container-entrypoint --export=ALL,PORT=8888 \
         bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 fi

From 95eb527ec14b4124f37689ae5ea9110c9d2bf6bb Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:02:46 -0500
Subject: [PATCH 09/20] fix: unset baked-in CUDA_VISIBLE_DEVICES for
 deepseek-v4-blackwell image

The image ENV pins CUDA_VISIBLE_DEVICES=4,5,6,7 (leftover from lmsys's
internal testing). With --no-container-entrypoint it isn't cleared, so
the container only sees 4 GPUs and TP=8 fails with
  torch.AcceleratorError: CUDA error: invalid device ordinal

Unset it at the top of the script so Slurm's 8-GPU allocation is visible.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 284ccfba3..449fcd936 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,6 +21,11 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
+# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
+# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
+# all ranks.
+unset CUDA_VISIBLE_DEVICES
+
 # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
 # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
 # The runner mounts our repo at a non-/workspace path for this image so the editable

From 9a3457ab8311dfec870e2db48fb88f4d86911f50 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:28:25 -0500
Subject: [PATCH 10/20] fix: apply same /ix mount fix to launch_b200-nb.sh

The earlier /ix mount fix only patched launch_b200-dgxc-slurm.sh; the
b200-nb runner still had the default $GITHUB_WORKSPACE:/workspace/ mount,
which masks the deepseek-v4-blackwell image's /workspace/sglang editable
install. Most B200 jobs in this repo run on b200-nb.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b200-nb.sh | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh
index c321ee0f9..98bd2c6c4 100644
--- a/runners/launch_b200-nb.sh
+++ b/runners/launch_b200-nb.sh
@@ -7,14 +7,25 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 UCX_NET_DEVICES=eth0
 
+# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
+# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+# breaks `import sglang`. Mount this one image at /ix instead; drop the
+# conditional once the image stops installing editable under /workspace.
+if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+    CONTAINER_MOUNT_DIR=/ix
+else
+    CONTAINER_MOUNT_DIR=/workspace
+fi
+
 set -x
 srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
 --container-image=$IMAGE \
 --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 --no-container-mount-home \
 --container-remap-root \
 --container-writable \
---container-workdir=/workspace/ \
+--container-workdir=$CONTAINER_MOUNT_DIR \
 --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \
 bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
\ No newline at end of file

From 9779d14bf0289521a09e096503479b35ace6b6ae Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 02:30:17 -0500
Subject: [PATCH 11/20] Drop --container-name arg from launch_b200-nb.sh

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b200-nb.sh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh
index 98bd2c6c4..6b411fec2 100644
--- a/runners/launch_b200-nb.sh
+++ b/runners/launch_b200-nb.sh
@@ -21,7 +21,6 @@ fi
 set -x
 srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
 --container-image=$IMAGE \
---container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
 --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
 --no-container-mount-home \
 --container-remap-root \

From fe012a70e1d4ec35f3dcc1856d4db5aa97823b92 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 10:19:22 -0500
Subject: [PATCH 12/20] Switch dsv4-fp4-b200-sglang runner to b200-nb

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9e57fb398..9adedaade 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1673,7 +1673,7 @@ dsv4-fp4-b200-sglang:
   image: lmsysorg/sglang:deepseek-v4-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b200
+  runner: b200-nb
   precision: fp4
   framework: sglang
   multinode: false

From 151a62fbebdf58102238ea6a9230a7290b774ac4 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 12:58:59 -0500
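Subject: [PATCH 13/20] dsv4_fp4_b200: switch to sglang serve recipe

- Launch the server with `sglang serve` instead of
  `python3 -m sglang.launch_server`.
- Add --chunked-prefill-size 4096 and --disable-flashinfer-autotune.
- Drop the `unset CUDA_VISIBLE_DEVICES` workaround.
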
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 449fcd936..c861536a8 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -21,11 +21,6 @@ nvidia-smi
 
 export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
 
-# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV,
-# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to
-# all ranks.
-unset CUDA_VISIBLE_DEVICES
-
 # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
 # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
 # The runner mounts our repo at a non-/workspace path for this image so the editable
@@ -46,11 +41,17 @@ fi
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 set -x
-PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \
---tp $TP \
---moe-runner-backend flashinfer_mxfp4 \
---mem-fraction-static 0.82 \
---disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+PYTHONNOUSERSITE=1 sglang serve \
+    --model-path $MODEL \
+    --host 0.0.0.0 \
+    --port $PORT \
+    --trust-remote-code \
+    --tp $TP \
+    --moe-runner-backend flashinfer_mxfp4 \
+    --mem-fraction-static 0.82 \
+    --chunked-prefill-size 4096 \
+    --disable-flashinfer-autotune \
+    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

From d96a2b0307a381a82f0fd11c02fba6417e706f54 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 13:19:03 -0500
Subject: [PATCH 14/20] Fix launch_b200-cw.sh and add b200-cw to runners pool

- Correct the script suffix from _h200 to _b200; the copy-paste from
  launch_h200-cw.sh would otherwise have routed b200 jobs to
  non-existent h200 scripts.
- Apply the same /ix mount conditional for deepseek-v4-blackwell as
  the other b200 runners, so sglang's editable install at
  /workspace/sglang/python isn't masked.
- Add b200-cw_00 / b200-cw_01 to the b200 runner pool in runners.yaml.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/runners.yaml |  2 ++
 runners/launch_b200-cw.sh    | 61 ++++++++++++++++++++++++++++++++++++
 2 files changed, 63 insertions(+)
 create mode 100644 runners/launch_b200-cw.sh

diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml
index 1bf0e2a6e..693bb4561 100644
--- a/.github/configs/runners.yaml
+++ b/.github/configs/runners.yaml
@@ -50,6 +50,8 @@ h200-multinode:
 - 'h200-dgxc-slurm_12'
 - 'h200-dgxc-slurm_13'
 b200:
+- 'b200-cw_00'
+- 'b200-cw_01'
 - 'b200-nb_0'
 - 'b200-nb_1'
 - 'b200-dgxc-slurm_0'
diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh
new file mode 100644
index 000000000..29614c9c5
--- /dev/null
+++ b/runners/launch_b200-cw.sh
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunners/hf-hub-cache"
+export PORT=8888
+
+MODEL_CODE="${EXP_NAME%%_*}"
+FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
+SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+
+PARTITION="b200"
+SQUASH_FILE="/mnt/vast/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+LOCK_FILE="${SQUASH_FILE}.lock"
+
+# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
+# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
+# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
+# breaks `import sglang`. Mount this one image at /ix instead; drop the
+# conditional once the image stops installing editable under /workspace.
+if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
+    CONTAINER_MOUNT_DIR=/ix
+else
+    CONTAINER_MOUNT_DIR=/workspace
+fi
+
+set -x
+
+JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b200:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')
+
+if [ -z "$JOB_ID" ]; then
+    echo "ERROR: salloc failed to allocate a job"
+    exit 1
+fi
+
+# Use Docker image directly for openai/gpt-oss-120b with trt, otherwise use squash file
+if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then
+    CONTAINER_IMAGE=$IMAGE
+else
+    # Use flock to serialize concurrent imports to the same squash file
+    srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c "
+        exec 9>\"$LOCK_FILE\"
+        flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
+        if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
+            echo 'Squash file already exists and is valid, skipping import'
+        else
+            rm -f \"$SQUASH_FILE\"
+            enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
+        fi
+    "
+    CONTAINER_IMAGE=$(realpath $SQUASH_FILE)
+fi
+
+srun --jobid=$JOB_ID \
+--container-image=$CONTAINER_IMAGE \
+--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
+--container-mount-home \
+--container-workdir=$CONTAINER_MOUNT_DIR \
+--no-container-entrypoint --export=ALL \
+bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
+
+rmdir $SAGEMAKER_SHM_PATH
+scancel $JOB_ID

From ffd8e474cb6c5e2b8feafbc97626818211699b6d Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 13:19:35 -0500
Subject: [PATCH 15/20] Switch dsv4-fp4-b200-sglang runner to b200-dsv4

---
 .github/configs/nvidia-master.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 9adedaade..3b21a4841 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1673,7 +1673,7 @@ dsv4-fp4-b200-sglang:
   image: lmsysorg/sglang:deepseek-v4-blackwell
   model: deepseek-ai/DeepSeek-V4-Pro
   model-prefix: dsv4
-  runner: b200-nb
+  runner: b200-dsv4
   precision: fp4
   framework: sglang
   multinode: false

From 3a354efa29465d040559c68197bad0afaafb3aac Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 13:35:46 -0500
Subject: [PATCH 16/20] update model storage to nvme

---
 runners/launch_b200-cw.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh
index 29614c9c5..ef0ad3528 100644
--- a/runners/launch_b200-cw.sh
+++ b/runners/launch_b200-cw.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunners/hf-hub-cache"
+export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache"
 export PORT=8888
 
 MODEL_CODE="${EXP_NAME%%_*}"
@@ -8,7 +8,7 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
 SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
 
 PARTITION="b200"
-SQUASH_FILE="/mnt/vast/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
 LOCK_FILE="${SQUASH_FILE}.lock"
 
 # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at

From a425131c3b86ff6d4990bdc80bb722e786bf4526 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 13:43:41 -0500
Subject: [PATCH 17/20] fix(launch_b200-cw): skip realpath on worker-local
 squash; drop stale rmdir
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- SQUASH_FILE lives under /tmp/gharunner/squash on the allocated worker
  node and isn't visible from the host, so realpath on the host returned
  empty and srun failed with 'Invalid --container-image argument: '.
  Pass the path straight through; srun resolves it inside the job.
- Remove the leftover 'rmdir $SAGEMAKER_SHM_PATH' — the env var isn't
  set in this cluster and rmdir fired with no operand every run.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 runners/launch_b200-cw.sh | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh
index ef0ad3528..ec7ba9a97 100644
--- a/runners/launch_b200-cw.sh
+++ b/runners/launch_b200-cw.sh
@@ -46,7 +46,10 @@ else
             enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
         fi
     "
-    CONTAINER_IMAGE=$(realpath $SQUASH_FILE)
+    # Squash file lives on the allocated worker node's /tmp, which is not
+    # visible from the host, so realpath on the host would return empty.
+    # Pass the path as-is; srun resolves it inside the job.
+    CONTAINER_IMAGE=$SQUASH_FILE
 fi
 
 srun --jobid=$JOB_ID \
@@ -57,5 +60,4 @@ srun --jobid=$JOB_ID \
 --no-container-entrypoint --export=ALL \
 bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
 
-rmdir $SAGEMAKER_SHM_PATH
 scancel $JOB_ID

From 103a202ce18ec68565652acdff4155516f138683 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 14:53:08 -0500
Subject: [PATCH 18/20] feat(dsv4_fp4_b200): pick recipe
 (low-latency/balanced/max-throughput) by CONC

The cookbook documents three B200 recipes for DeepSeek-V4-Pro that
differ significantly in server flags. Pick between them based on CONC:

  CONC <= 32   -> low-latency    (TP only, chunked-prefill 4096,
                                  disable-flashinfer-autotune)
  33..128      -> balanced       (+ DP-attention, max-running-reqs=128,
                                  cuda-graph-max-bs=64, deepep-config)
  CONC > 128   -> max-throughput (+ DP-attention, max-running-reqs=256,
                                  cuda-graph-max-bs=64, deepep-config)

Speculative decoding still omitted from all three per the no-spec
baseline, and --disable-radix-cache kept for no-prefix-caching.
Thresholds mirror the recipes' own max-running-requests caps.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 41 ++++++++++++++++++++++---
 1 file changed, 37 insertions(+), 4 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index c861536a8..1e5b737c7 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -40,6 +40,40 @@ fi
 
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# (spec-decoding flags dropped for the baseline):
+#   - low-latency    (CONC <= 32):        TP-only, chunked-prefill, disable autotune
+#   - balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
+#   - max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 4096
+        --disable-flashinfer-autotune
+    )
+elif [[ $CONC -le 128 ]]; then
+    RECIPE=balanced
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --cuda-graph-max-bs 64
+        --max-running-requests 128
+        --deepep-config "$DEEPEP_CONFIG"
+    )
+else
+    RECIPE=max-throughput
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --cuda-graph-max-bs 64
+        --max-running-requests 256
+        --deepep-config "$DEEPEP_CONFIG"
+    )
+fi
+echo "Recipe: $RECIPE (CONC=$CONC)"
+
 set -x
 PYTHONNOUSERSITE=1 sglang serve \
     --model-path $MODEL \
@@ -47,11 +81,10 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --moe-runner-backend flashinfer_mxfp4 \
+    --moe-a2a-backend deepep \
     --mem-fraction-static 0.82 \
-    --chunked-prefill-size 4096 \
-    --disable-flashinfer-autotune \
-    --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
+    --disable-radix-cache \
+    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 
 SERVER_PID=$!
 

From 4a96602cdc1d4ac21a24f069d2d36f196e6f7678 Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 15:17:56 -0500
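Subject: [PATCH 19/20] dsv4_fp4_b200: scope deepep flags to DP-attn recipes

- Move --moe-a2a-backend deepep out of the common flags and into the
  balanced and max-throughput recipes only; low-latency no longer sets it.
- Move --mem-fraction-static 0.82 into each recipe's flag list.
- Export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 for the
  balanced and max-throughput recipes.
- Note in the recipe comment that prefix-caching flags are also dropped.
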
---
 benchmarks/single_node/dsv4_fp4_b200.sh | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh
index 1e5b737c7..d455af3a3 100755
--- a/benchmarks/single_node/dsv4_fp4_b200.sh
+++ b/benchmarks/single_node/dsv4_fp4_b200.sh
@@ -41,35 +41,43 @@ fi
 start_gpu_monitor --output "$PWD/gpu_metrics.csv"
 
 # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
-# (spec-decoding flags dropped for the baseline):
+# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
 #   - low-latency    (CONC <= 32):        TP-only, chunked-prefill, disable autotune
 #   - balanced       (32 < CONC <= 128):  + DP-attn, max-running-requests=128
 #   - max-throughput (CONC > 128):        + DP-attn, max-running-requests=256
 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
 if [[ $CONC -le 32 ]]; then
     RECIPE=low-latency
     RECIPE_FLAGS=(
         --moe-runner-backend flashinfer_mxfp4
         --chunked-prefill-size 4096
         --disable-flashinfer-autotune
+        --mem-fraction-static 0.82
     )
 elif [[ $CONC -le 128 ]]; then
     RECIPE=balanced
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
     RECIPE_FLAGS=(
         --dp-size "$TP"
         --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
         --cuda-graph-max-bs 64
         --max-running-requests 128
-        --deepep-config "$DEEPEP_CONFIG"
     )
 else
     RECIPE=max-throughput
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
     RECIPE_FLAGS=(
         --dp-size "$TP"
         --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
         --cuda-graph-max-bs 64
         --max-running-requests 256
-        --deepep-config "$DEEPEP_CONFIG"
     )
 fi
 echo "Recipe: $RECIPE (CONC=$CONC)"
@@ -81,8 +89,6 @@ PYTHONNOUSERSITE=1 sglang serve \
     --port $PORT \
     --trust-remote-code \
     --tp $TP \
-    --moe-a2a-backend deepep \
-    --mem-fraction-static 0.82 \
     --disable-radix-cache \
     "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 &
 

From 43be495bdf3ef20a74c8f6b12acbb5f24d60896a Mon Sep 17 00:00:00 2001
From: Cam Quilici <cjquilici@gmail.com>
Date: Fri, 24 Apr 2026 15:28:54 -0500
Subject: [PATCH 20/20] feat(dsv4-fp4-b200-sglang): split search-space per
 sglang recipe

Split each seq-len config's single search-space row (CONC 4..1024 for
ISL 1024, CONC 4..512 for ISL 8192) into three rows (low-latency /
balanced / max-throughput) matching the recipe boundaries inside
dsv4_fp4_b200.sh, so result filenames carry accurate ep= and dpa= labels.
ep=8 on balanced/max-throughput reflects sglang's implicit
ep_size=tp_size override when --moe-a2a-backend deepep is set.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/configs/nvidia-master.yaml | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 3b21a4841..13ef0ff2b 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1677,15 +1677,33 @@ dsv4-fp4-b200-sglang:
   precision: fp4
   framework: sglang
   multinode: false
+  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+  # are selected inside benchmarks/single_node/dsv4_fp4_b200.sh by CONC:
+  #   low-latency    (CONC <= 32):       TP-only
+  #   balanced       (32 < CONC <= 128): + DP-attn
+  #   max-throughput (CONC > 128):       + DP-attn
+  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
+  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+  # while low-latency leaves ep_size at the default of 1.
   seq-len-configs:
   - isl: 1024
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
+    # low-latency
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    # balanced
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+    # max-throughput
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
   - isl: 8192
     osl: 1024
     search-space:
-    - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
+    # low-latency
+    - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+    # balanced
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+    # max-throughput
+    - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
 
 # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4