From f9a0ed9f8614be5aabf381cb4c644d129ccaf63e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:10:08 -0500 Subject: [PATCH 01/20] Add dsv4-fp4-b200-sglang single-node config Adds the DeepSeek-V4-Flash B200 SGLang recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4. Prefix caching and speculative decoding are disabled for baseline numbers. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 18 ++++++ benchmarks/single_node/dsv4_fp4_b200.sh | 75 +++++++++++++++++++++++++ perf-changelog.yaml | 9 +++ 3 files changed, 102 insertions(+) create mode 100755 benchmarks/single_node/dsv4_fp4_b200.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 96273444f..49be01a98 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1669,6 +1669,24 @@ dsr1-fp4-b200-sglang: - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } - { tp: 8, ep: 8, conc-start: 4, conc-end: 16 } +dsv4-fp4-b200-sglang: + image: lmsysorg/sglang:deepseek-v4-blackwell + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b200 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 # B200 SGLang recipe as-is until B300-specific tuning is available. 
diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh new file mode 100755 index 000000000..7faa661b2 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -0,0 +1,75 @@ +#!/usr/bin/env bash + +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +nvidia-smi + +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +if [[ $CONC -ge 16 ]]; then + SCHEDULER_RECV_INTERVAL=30 +else + SCHEDULER_RECV_INTERVAL=10 +fi +echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +--tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +--moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \ +--chunked-prefill-size 4096 --disable-flashinfer-autotune \ +--scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ +--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts $((CONC * 10)) \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 2b2e138c8..7dd1629b8 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,3 +1,12 @@ +- config-keys: + - dsv4-fp4-b200-sglang + description: + - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)" + - "Container: lmsysorg/sglang:deepseek-v4-blackwell" + - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Prefix caching and speculative decoding disabled for baseline numbers" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + - config-keys: - dsr1-fp8-h100-dynamo-trt - dsr1-fp8-h100-dynamo-sglang From 44a1c1f490dc19f02dd41914278c47fbfece445d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:18:56 -0500 Subject: [PATCH 02/20] Switch dsv4-fp4-b200-sglang to Pro model, match vllm parallelism Uses deepseek-ai/DeepSeek-V4-Pro with tp=8, ep=8, dp-attention enabled and sweep concurrency ranges aligned with dsv4-fp4-b200-vllm (4-1024 at 1k/1k, 4-512 at 8k/1k). Script now passes --enable-dp-attention when DP_ATTENTION=true and sets --mem-fraction-static per the Pro recipe. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 6 +++--- benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++--- perf-changelog.yaml | 5 +++-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 49be01a98..3a4695665 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1671,7 +1671,7 @@ dsr1-fp4-b200-sglang: dsv4-fp4-b200-sglang: image: lmsysorg/sglang:deepseek-v4-blackwell - model: deepseek-ai/DeepSeek-V4-Flash + model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b200 precision: fp4 @@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 32 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 } # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 7faa661b2..c5860e868 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -10,7 +10,8 @@ check_env_vars \ OSL \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ - EP_SIZE + EP_SIZE \ + DP_ATTENTION if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -30,7 +31,12 @@ if [[ $CONC -ge 16 ]]; then else SCHEDULER_RECV_INTERVAL=10 fi -echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "SCHEDULER_RECV_INTERVAL: $SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" + +DP_ATTN_ARGS="" +if [[ "$DP_ATTENTION" == 
"true" ]]; then + DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP" +fi EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -42,8 +48,9 @@ start_gpu_monitor set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tensor-parallel-size=$TP --data-parallel-size=1 --ep-size $EP_SIZE \ +--tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \ --moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \ +--mem-fraction-static 0.82 \ --chunked-prefill-size 4096 --disable-flashinfer-autotune \ --scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 7dd1629b8..45c0c8ebf 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1,11 +1,12 @@ - config-keys: - dsv4-fp4-b200-sglang description: - - "Add DeepSeek-V4-Flash single-node B200 SGLang benchmark (TP4, FP4 MoE + FP8 dense)" + - "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)" - "Container: lmsysorg/sglang:deepseek-v4-blackwell" - "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" + - "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config" - "Prefix caching and speculative decoding disabled for baseline numbers" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/TBD + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131 - config-keys: - dsr1-fp8-h100-dynamo-trt From c21ee5cce0f9a2c6d2d223d49b0248e9cddf34e4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:22:15 -0500 Subject: [PATCH 03/20] Match DSV4 Pro SGLang recipe literally; port HF cache path Server launch now mirrors the DeepSeek-V4-Pro command from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4: --tp N, --moe-runner-backend flashinfer_mxfp4, --mem-fraction-static 0.82, SGLANG_JIT_DEEPGEMM_PRECOMPILE=0. 
Speculative decoding omitted and --disable-radix-cache added per the no-spec / no-prefix-cache baseline. YAML search-space drops ep/dp-attn to tp=8, ep=1. Also syncs runners/launch_b200-dgxc-slurm.sh with the HF cache mount path from origin/claude/add-dsv4-fp4-b200-vllm so both PRs stay in agreement on runner layout. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 4 ++-- benchmarks/single_node/dsv4_fp4_b200.sh | 22 ++++------------------ runners/launch_b200-dgxc-slurm.sh | 5 ++--- 3 files changed, 8 insertions(+), 23 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3a4695665..9e57fb398 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1681,11 +1681,11 @@ dsv4-fp4-b200-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4 diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index c5860e868..0ed538599 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -9,9 +9,7 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE \ - DP_ATTENTION + RESULT_FILENAME if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -26,17 +24,7 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -if [[ $CONC -ge 16 ]]; then - SCHEDULER_RECV_INTERVAL=30 -else - SCHEDULER_RECV_INTERVAL=10 -fi -echo "SCHEDULER_RECV_INTERVAL: 
$SCHEDULER_RECV_INTERVAL, CONC: $CONC, ISL: $ISL, OSL: $OSL, TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION" - -DP_ATTN_ARGS="" -if [[ "$DP_ATTENTION" == "true" ]]; then - DP_ATTN_ARGS="--enable-dp-attention --dp-size $TP" -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -48,11 +36,9 @@ start_gpu_monitor set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \ ---moe-runner-backend flashinfer_mxfp4 --moe-a2a-backend deepep \ +--tp $TP \ +--moe-runner-backend flashinfer_mxfp4 \ --mem-fraction-static 0.82 \ ---chunked-prefill-size 4096 --disable-flashinfer-autotune \ ---scheduler-recv-interval $SCHEDULER_RECV_INTERVAL \ --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index c0f25310b..b9d4d90cc 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -249,8 +249,7 @@ EOF else - HF_HUB_CACHE_MOUNT="/scratch/fsw/models" - export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}" + HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') @@ -276,7 +275,7 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ From 039977307b42c6c6c67b325dfe827b022133e5fc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: 
Fri, 24 Apr 2026 01:27:00 -0500 Subject: [PATCH 04/20] fix: use 'sglang serve' CLI, not python -m sglang.launch_server The deepseek-v4-blackwell image doesn't expose sglang via system python3, so the module import fails: /usr/bin/python3: Error while finding module specification for 'sglang.launch_server' (ModuleNotFoundError: No module named 'sglang') Switch to the `sglang serve` entrypoint that the cookbook uses; the CLI resolves the correct interpreter. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 0ed538599..0f443415a 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -35,7 +35,7 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tp $TP \ --moe-runner-backend flashinfer_mxfp4 \ --mem-fraction-static 0.82 \ From 4a3e3e95bd6378cdec4d0b632d58bae3a6d46a52 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:42:29 -0500 Subject: [PATCH 05/20] fix: mount repo at /ix for deepseek-v4-blackwell image MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python — unlike every prior sglang tag which uses /sgl-workspace/sglang. Our $GITHUB_WORKSPACE:/workspace/ bind-mount masks that directory, breaking `import sglang`. Conditionally mount at /ix for this image only and make the dsv4 benchmark script use $PWD for server/metrics/result paths so it works regardless of the mount target. All other configs still mount at /workspace. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 6 +++--- runners/launch_b200-dgxc-slurm.sh | 13 +++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 0f443415a..598fbc77d 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,7 +21,7 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -SERVER_LOG=/workspace/server.log +SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -32,7 +32,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -start_gpu_monitor +start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -57,7 +57,7 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir "$PWD/" if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index b9d4d90cc..5cb7c24fd 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -255,6 +255,15 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" + # The deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python, + # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for + # this image so the in-image sglang source stays visible. 
+ if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix + else + CONTAINER_MOUNT_DIR=/workspace + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -275,9 +284,9 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=/workspace/ \ + --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh fi From ffd0874f9730f38d744d5c3d431b4ea1f223c7e5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:47:20 -0500 Subject: [PATCH 06/20] fix: reinstall sglang from PyPI to work around masked editable install The lmsysorg/sglang:deepseek-v4-blackwell image installs sglang editable at /workspace/sglang/python, which our $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Temporary one-line workaround: pip install --no-deps sglang in the benchmark script to restore a non-editable copy in site-packages. Runner reverted to the standard /workspace mount. Marked with a TODO(Cam) for the proper fix once lmsys publishes an image that doesn't editable-install under /workspace. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 13 ++++++++++--- runners/launch_b200-dgxc-slurm.sh | 13 ++----------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 598fbc77d..2f58a179b 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,7 +21,14 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -SERVER_LOG="$PWD/server.log" +# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image +# installs sglang editable at /workspace/sglang/python, which the runner's +# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any +# custom patches baked into the image's local sglang source. Revert once lmsys +# ships an image that installs sglang outside /workspace (or non-editable). +pip install --no-deps --quiet sglang + +SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -32,7 +39,7 @@ if [ "${EVAL_ONLY}" = "true" ]; then EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -start_gpu_monitor --output "$PWD/gpu_metrics.csv" +start_gpu_monitor set -x sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ @@ -57,7 +64,7 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir "$PWD/" + --result-dir /workspace/ if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index 5cb7c24fd..b9d4d90cc 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -255,15 +255,6 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" - # The deepseek-v4-blackwell image installs 
sglang editable at /workspace/sglang/python, - # which our usual $GITHUB_WORKSPACE:/workspace/ bind-mount would mask. Mount under /ix for - # this image so the in-image sglang source stays visible. - if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then - CONTAINER_MOUNT_DIR=/ix - else - CONTAINER_MOUNT_DIR=/workspace - fi - salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -284,9 +275,9 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=$CONTAINER_MOUNT_DIR \ + --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh fi From fef260fa0ab843e6acd8b5dafb2cf8b6cdb8ccc2 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:53:00 -0500 Subject: [PATCH 07/20] fix: uninstall editable sglang before reinstalling from PyPI 'pip install --no-deps sglang' is a no-op when sglang is already registered in site-packages -- even if the underlying editable path is missing -- so the prior workaround never actually swapped in a working install. Uninstall the broken egg-link first, then reinstall. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 2f58a179b..bfeb30249 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -23,9 +23,11 @@ export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 # TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image # installs sglang editable at /workspace/sglang/python, which the runner's -# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Reinstalling from PyPI drops any -# custom patches baked into the image's local sglang source. Revert once lmsys -# ships an image that installs sglang outside /workspace (or non-editable). +# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable +# link, then reinstall from PyPI (drops any custom patches baked into the +# image's local sglang source). Revert once lmsys ships an image that installs +# sglang outside /workspace (or non-editable). +pip uninstall -y sglang 2>/dev/null || true pip install --no-deps --quiet sglang SERVER_LOG=/workspace/server.log From da148a1637f0646dc3686ac5d7411ffdde12b04d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:56:49 -0500 Subject: [PATCH 08/20] fix: mount repo at /ix for deepseek-v4-blackwell; drop pip workaround Back to the proper mount fix so we use the same 'PYTHONNOUSERSITE=1 python3 -m sglang.launch_server ...' invocation as every other sglang single_node script. Conditional mount target keeps the blast radius to this one config. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 23 ++++++++++------------- runners/launch_b200-dgxc-slurm.sh | 15 +++++++++++++-- 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index bfeb30249..284ccfba3 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,16 +21,13 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -# TODO(Cam): sloppy workaround -- the lmsysorg/sglang:deepseek-v4-blackwell image -# installs sglang editable at /workspace/sglang/python, which the runner's -# $GITHUB_WORKSPACE:/workspace/ bind-mount masks. Uninstall the broken editable -# link, then reinstall from PyPI (drops any custom patches baked into the -# image's local sglang source). Revert once lmsys ships an image that installs -# sglang outside /workspace (or non-editable). -pip uninstall -y sglang 2>/dev/null || true -pip install --no-deps --quiet sglang - -SERVER_LOG=/workspace/server.log +# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang +# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. +# The runner mounts our repo at a non-/workspace path for this image so the editable +# install stays visible. Paths in this script are $PWD-relative for that reason. +# Drop the runner conditional once lmsys moves sglang back out of /workspace. 
+ +SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -41,10 +38,10 @@ if [ "${EVAL_ONLY}" = "true" ]; then EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -start_gpu_monitor +start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x -sglang serve --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ --tp $TP \ --moe-runner-backend flashinfer_mxfp4 \ --mem-fraction-static 0.82 \ @@ -66,7 +63,7 @@ run_benchmark_serving \ --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir "$PWD/" if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index b9d4d90cc..c07037ff4 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -255,6 +255,17 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" + # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at + # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so + # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and + # breaks `import sglang`. Mount this one image at /ix instead; drop the + # conditional once the image stops installing editable under /workspace. 
+ if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix + else + CONTAINER_MOUNT_DIR=/workspace + fi + salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) @@ -275,9 +286,9 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ - --container-workdir=/workspace/ \ + --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh fi From 95eb527ec14b4124f37689ae5ea9110c9d2bf6bb Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:02:46 -0500 Subject: [PATCH 09/20] fix: unset baked-in CUDA_VISIBLE_DEVICES for deepseek-v4-blackwell image The image ENV pins CUDA_VISIBLE_DEVICES=4,5,6,7 (leftover from lmsys's internal testing). With --no-container-entrypoint it isn't cleared, so the container only sees 4 GPUs and TP=8 fails with torch.AcceleratorError: CUDA error: invalid device ordinal Unset it at the top of the script so Slurm's 8-GPU allocation is visible. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 284ccfba3..449fcd936 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,6 +21,11 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV, +# which masks half of the 8 GPUs Slurm allocates us. 
Clear it so TP=8 can bind to +# all ranks. +unset CUDA_VISIBLE_DEVICES + # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. # The runner mounts our repo at a non-/workspace path for this image so the editable From 9a3457ab8311dfec870e2db48fb88f4d86911f50 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:28:25 -0500 Subject: [PATCH 10/20] fix: apply same /ix mount fix to launch_b200-nb.sh Only patched launch_b200-dgxc-slurm.sh last time; the b200-nb runner still had the default $GITHUB_WORKSPACE:/workspace/ mount, which masks the deepseek-v4-blackwell image's /workspace/sglang editable install. Most B200 jobs in this repo run on b200-nb. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b200-nb.sh | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index c321ee0f9..98bd2c6c4 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -7,14 +7,25 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') UCX_NET_DEVICES=eth0 +# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at +# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so +# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and +# breaks `import sglang`. Mount this one image at /ix instead; drop the +# conditional once the image stops installing editable under /workspace. 
+if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix +else + CONTAINER_MOUNT_DIR=/workspace +fi + set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ --container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ ---container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-remap-root \ --container-writable \ ---container-workdir=/workspace/ \ +--container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh \ No newline at end of file From 9779d14bf0289521a09e096503479b35ace6b6ae Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:30:17 -0500 Subject: [PATCH 11/20] Drop --container-name arg from launch_b200-nb.sh Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b200-nb.sh | 1 - 1 file changed, 1 deletion(-) diff --git a/runners/launch_b200-nb.sh b/runners/launch_b200-nb.sh index 98bd2c6c4..6b411fec2 100644 --- a/runners/launch_b200-nb.sh +++ b/runners/launch_b200-nb.sh @@ -21,7 +21,6 @@ fi set -x srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \ --container-image=$IMAGE \ ---container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \ --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-remap-root \ --container-writable \ From fe012a70e1d4ec35f3dcc1856d4db5aa97823b92 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 10:19:22 -0500 Subject: [PATCH 12/20] Switch dsv4-fp4-b200-sglang runner from b200 to b200-nb --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml
b/.github/configs/nvidia-master.yaml index 9e57fb398..9adedaade 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1673,7 +1673,7 @@ dsv4-fp4-b200-sglang: image: lmsysorg/sglang:deepseek-v4-blackwell model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: b200 + runner: b200-nb precision: fp4 framework: sglang multinode: false From 151a62fbebdf58102238ea6a9230a7290b774ac4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 12:58:59 -0500 Subject: [PATCH 13/20] Update dsv4 B200 recipe: use sglang serve, add chunked-prefill and no-autotune flags, drop CUDA_VISIBLE_DEVICES unset --- benchmarks/single_node/dsv4_fp4_b200.sh | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 449fcd936..c861536a8 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -21,11 +21,6 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV, -# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to -# all ranks. -unset CUDA_VISIBLE_DEVICES - # TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang # editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. 
# The runner mounts our repo at a non-/workspace path for this image so the editable @@ -46,11 +41,17 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path $MODEL --host 0.0.0.0 --port $PORT --trust-remote-code \ ---tp $TP \ ---moe-runner-backend flashinfer_mxfp4 \ ---mem-fraction-static 0.82 \ ---disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +PYTHONNOUSERSITE=1 sglang serve \ + --model-path $MODEL \ + --host 0.0.0.0 \ + --port $PORT \ + --trust-remote-code \ + --tp $TP \ + --moe-runner-backend flashinfer_mxfp4 \ + --mem-fraction-static 0.82 \ + --chunked-prefill-size 4096 \ + --disable-flashinfer-autotune \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! From d96a2b0307a381a82f0fd11c02fba6417e706f54 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 13:19:03 -0500 Subject: [PATCH 14/20] Fix launch_b200-cw.sh and add b200-cw to runners pool - Correct suffix from _h200 to _b200 (copy-paste from launch_h200-cw.sh would have routed b200 jobs to non-existent h200 scripts). - Apply the same /ix mount conditional for deepseek-v4-blackwell as the other b200 runners, so sglang's editable install at /workspace/sglang/python isn't masked. - Add b200-cw_00 / b200-cw_01 to the b200 runner pool in runners.yaml. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/runners.yaml | 2 ++ runners/launch_b200-cw.sh | 61 ++++++++++++++++++++++++++++++++++++ 2 files changed, 63 insertions(+) create mode 100644 runners/launch_b200-cw.sh diff --git a/.github/configs/runners.yaml b/.github/configs/runners.yaml index 1bf0e2a6e..693bb4561 100644 --- a/.github/configs/runners.yaml +++ b/.github/configs/runners.yaml @@ -50,6 +50,8 @@ h200-multinode: - 'h200-dgxc-slurm_12' - 'h200-dgxc-slurm_13' b200: +- 'b200-cw_00' +- 'b200-cw_01' - 'b200-nb_0' - 'b200-nb_1' - 'b200-dgxc-slurm_0' diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh new file mode 100644 index 000000000..29614c9c5 --- /dev/null +++ b/runners/launch_b200-cw.sh @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunners/hf-hub-cache" +export PORT=8888 + +MODEL_CODE="${EXP_NAME%%_*}" +FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') +SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + +PARTITION="b200" +SQUASH_FILE="/mnt/vast/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +LOCK_FILE="${SQUASH_FILE}.lock" + +# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at +# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so +# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and +# breaks `import sglang`. Mount this one image at /ix instead; drop the +# conditional once the image stops installing editable under /workspace. 
+if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix +else + CONTAINER_MOUNT_DIR=/workspace +fi + +set -x + +JOB_ID=$(salloc --partition=$PARTITION --gres=gpu:b200:$TP --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+') + +if [ -z "$JOB_ID" ]; then + echo "ERROR: salloc failed to allocate a job" + exit 1 +fi + +# Use Docker image directly for openai/gpt-oss-120b with trt, otherwise use squash file +if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then + CONTAINER_IMAGE=$IMAGE +else + # Use flock to serialize concurrent imports to the same squash file + srun --jobid=$JOB_ID --job-name="$RUNNER_NAME" bash -c " + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " + CONTAINER_IMAGE=$(realpath $SQUASH_FILE) +fi + +srun --jobid=$JOB_ID \ +--container-image=$CONTAINER_IMAGE \ +--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ +--container-mount-home \ +--container-workdir=$CONTAINER_MOUNT_DIR \ +--no-container-entrypoint --export=ALL \ +bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh + +rmdir $SAGEMAKER_SHM_PATH +scancel $JOB_ID From ffd8e474cb6c5e2b8feafbc97626818211699b6d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 13:19:35 -0500 Subject: [PATCH 15/20] update recipe --- .github/configs/nvidia-master.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 9adedaade..3b21a4841 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1673,7 +1673,7 @@ 
dsv4-fp4-b200-sglang: image: lmsysorg/sglang:deepseek-v4-blackwell model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 - runner: b200-nb + runner: b200-dsv4 precision: fp4 framework: sglang multinode: false From 3a354efa29465d040559c68197bad0afaafb3aac Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 13:35:46 -0500 Subject: [PATCH 16/20] update model storage to nvme --- runners/launch_b200-cw.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh index 29614c9c5..ef0ad3528 100644 --- a/runners/launch_b200-cw.sh +++ b/runners/launch_b200-cw.sh @@ -1,6 +1,6 @@ #!/usr/bin/env bash -export HF_HUB_CACHE_MOUNT="/mnt/vast/gharunners/hf-hub-cache" +export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache" export PORT=8888 MODEL_CODE="${EXP_NAME%%_*}" @@ -8,7 +8,7 @@ FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') PARTITION="b200" -SQUASH_FILE="/mnt/vast/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" LOCK_FILE="${SQUASH_FILE}.lock" # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at From a425131c3b86ff6d4990bdc80bb722e786bf4526 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 13:43:41 -0500 Subject: [PATCH 17/20] fix(launch_b200-cw): skip realpath on worker-local squash; drop stale rmdir MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - SQUASH_FILE lives under /tmp/gharunner/squash on the allocated worker node and isn't visible from the host, so realpath on the host returned empty and srun failed with 'Invalid --container-image argument: '. Pass the path straight through; srun resolves it inside the job. 
- Remove the leftover 'rmdir $SAGEMAKER_SHM_PATH' — the env var isn't set in this cluster and rmdir fired with no operand every run. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b200-cw.sh | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/runners/launch_b200-cw.sh b/runners/launch_b200-cw.sh index ef0ad3528..ec7ba9a97 100644 --- a/runners/launch_b200-cw.sh +++ b/runners/launch_b200-cw.sh @@ -46,7 +46,10 @@ else enroot import -o \"$SQUASH_FILE\" docker://$IMAGE fi " - CONTAINER_IMAGE=$(realpath $SQUASH_FILE) + # Squash file lives on the allocated worker node's /tmp, which is not + # visible from the host, so realpath on the host would return empty. + # Pass the path as-is; srun resolves it inside the job. + CONTAINER_IMAGE=$SQUASH_FILE fi srun --jobid=$JOB_ID \ @@ -57,5 +60,4 @@ srun --jobid=$JOB_ID \ --no-container-entrypoint --export=ALL \ bash benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh -rmdir $SAGEMAKER_SHM_PATH scancel $JOB_ID From 103a202ce18ec68565652acdff4155516f138683 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 14:53:08 -0500 Subject: [PATCH 18/20] feat(dsv4_fp4_b200): pick recipe (low-latency/balanced/max-throughput) by CONC The cookbook documents three B200 recipes for DeepSeek-V4-Pro that differ significantly in server flags. Pick between them based on CONC: CONC <= 32 -> low-latency (TP only, chunked-prefill 4096, disable-flashinfer-autotune) 33..128 -> balanced (+ DP-attention, max-running-reqs=128, cuda-graph-max-bs=64, deepep-config) CONC > 128 -> max-throughput (+ DP-attention, max-running-reqs=256, cuda-graph-max-bs=64, deepep-config) Speculative decoding still omitted from all three per the no-spec baseline, and --disable-radix-cache kept for no-prefix-caching. Thresholds mirror the recipes' own max-running-requests caps. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b200.sh | 41 ++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index c861536a8..1e5b737c7 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -40,6 +40,40 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" +# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# (spec-decoding flags dropped for the baseline): +# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune +# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 +# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' +if [[ $CONC -le 32 ]]; then + RECIPE=low-latency + RECIPE_FLAGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 4096 + --disable-flashinfer-autotune + ) +elif [[ $CONC -le 128 ]]; then + RECIPE=balanced + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --cuda-graph-max-bs 64 + --max-running-requests 128 + --deepep-config "$DEEPEP_CONFIG" + ) +else + RECIPE=max-throughput + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --cuda-graph-max-bs 64 + --max-running-requests 256 + --deepep-config "$DEEPEP_CONFIG" + ) +fi +echo "Recipe: $RECIPE (CONC=$CONC)" + set -x PYTHONNOUSERSITE=1 sglang serve \ --model-path $MODEL \ @@ -47,11 +81,10 @@ PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --moe-runner-backend flashinfer_mxfp4 \ + --moe-a2a-backend deepep \ --mem-fraction-static 0.82 \ - --chunked-prefill-size 4096 \ - --disable-flashinfer-autotune \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --disable-radix-cache \ + "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
From 4a96602cdc1d4ac21a24f069d2d36f196e6f7678 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 15:17:56 -0500 Subject: [PATCH 19/20] update b200 --- benchmarks/single_node/dsv4_fp4_b200.sh | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b200.sh b/benchmarks/single_node/dsv4_fp4_b200.sh index 1e5b737c7..d455af3a3 100755 --- a/benchmarks/single_node/dsv4_fp4_b200.sh +++ b/benchmarks/single_node/dsv4_fp4_b200.sh @@ -41,35 +41,43 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# (spec-decoding flags dropped for the baseline): +# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): # - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune # - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 # - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + if [[ $CONC -le 32 ]]; then RECIPE=low-latency RECIPE_FLAGS=( --moe-runner-backend flashinfer_mxfp4 --chunked-prefill-size 4096 --disable-flashinfer-autotune + --mem-fraction-static 0.82 ) elif [[ $CONC -le 128 ]]; then RECIPE=balanced + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 RECIPE_FLAGS=( --dp-size "$TP" --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 --cuda-graph-max-bs 64 --max-running-requests 128 - --deepep-config "$DEEPEP_CONFIG" ) else RECIPE=max-throughput + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 RECIPE_FLAGS=( --dp-size "$TP" --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 --cuda-graph-max-bs 64 --max-running-requests 256 - --deepep-config "$DEEPEP_CONFIG" ) fi echo "Recipe: $RECIPE (CONC=$CONC)" @@ -81,8 +89,6 @@ 
PYTHONNOUSERSITE=1 sglang serve \ --port $PORT \ --trust-remote-code \ --tp $TP \ - --moe-a2a-backend deepep \ - --mem-fraction-static 0.82 \ --disable-radix-cache \ "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & From 43be495bdf3ef20a74c8f6b12acbb5f24d60896a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 15:28:54 -0500 Subject: [PATCH 20/20] feat(dsv4-fp4-b200-sglang): split search-space per sglang recipe Split the single CONC 4..1024/512 row into three rows (low-latency / balanced / max-throughput) matching the recipe boundaries inside dsv4_fp4_b200.sh so result filenames carry accurate ep= and dpa= labels. ep=8 on balanced/max-throughput reflects sglang's implicit ep_size=tp_size override when --moe-a2a-backend deepep is set. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 3b21a4841..13ef0ff2b 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1677,15 +1677,33 @@ dsv4-fp4-b200-sglang: precision: fp4 framework: sglang multinode: false + # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 + # are selected inside benchmarks/single_node/dsv4_fp4_b200.sh by CONC: + # low-latency (CONC <= 32): TP-only + # balanced (32 < CONC <= 128): + DP-attn + # max-throughput (CONC > 128): + DP-attn + # Split so result filenames (ep=, dpa=) accurately reflect the recipe. + # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, + # while low-latency leaves ep_size at the default of 1. 
seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } + # low-latency + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + # balanced + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } + # low-latency + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + # balanced + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } # NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1 # does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4