From 26e540d9e81ee12dc1fc9505b6e33b93b7d2f374 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:14:38 -0500 Subject: [PATCH 01/24] feat: add DeepSeek-V4-Flash FP4 B300 SGLang benchmark Adds dsv4-fp4-b300-sglang config, single-node benchmark script, and perf-changelog entry for the DeepSeek-V4 recipe from the SGLang cookbook. The cookbook ships a B200 (not B300) recipe, so this reuses the B200 Flash Low-Latency recipe on B300 until a B300-specific recipe lands. Speculative decoding (EAGLE) and prefix caching are disabled per request. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 22 +++++++ benchmarks/single_node/dsv4_fp4_b300.sh | 76 +++++++++++++++++++++++++ perf-changelog.yaml | 13 ++++- 3 files changed, 110 insertions(+), 1 deletion(-) create mode 100755 benchmarks/single_node/dsv4_fp4_b300.sh diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ec9cbc11e..a7dcdb20f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1796,6 +1796,28 @@ dsr1-fp8-b300-sglang: - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 } - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 } +# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# lists B200 (not B300) as the Blackwell target. This config reuses the +# B200 Flash FP4 Low-Latency recipe on B300 until a B300-specific recipe +# ships. Speculative decoding (EAGLE) and prefix caching are disabled. 
+dsv4-fp4-b300-sglang: + image: lmsysorg/sglang:deepseek-v4-blackwell + model: deepseek-ai/DeepSeek-V4-Flash + model-prefix: dsv4 + runner: b300 + precision: fp4 + framework: sglang + multinode: false + seq-len-configs: + - isl: 1024 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - isl: 8192 + osl: 1024 + search-space: + - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e model: Qwen/Qwen3.5-397B-A17B diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh new file mode 100755 index 000000000..dc0244f36 --- /dev/null +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# only ships a B200 recipe for Blackwell. This script reuses the B200 Flash +# FP4 Low-Latency recipe as-is on B300 until a B300-specific recipe ships. +# Speculative decoding (EAGLE) and prefix caching are disabled per request. 
+ +source "$(dirname "$0")/../benchmark_lib.sh" + +check_env_vars \ + MODEL \ + TP \ + CONC \ + ISL \ + OSL \ + RANDOM_RANGE_RATIO \ + RESULT_FILENAME \ + EP_SIZE + +if [[ -n "$SLURM_JOB_ID" ]]; then + echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" +fi + +hf download "$MODEL" + +nvidia-smi + +export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 + +SERVER_LOG=/workspace/server.log +PORT=${PORT:-8888} + +echo "TP: $TP, EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +EVAL_CONTEXT_ARGS="" +if [ "${EVAL_ONLY}" = "true" ]; then + setup_eval_context + EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" +fi + +start_gpu_monitor + +set -x +PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ +--trust-remote-code \ +--tensor-parallel-size=$TP --ep-size $EP_SIZE \ +--moe-runner-backend flashinfer_mxfp4 \ +--chunked-prefill-size 4096 \ +--disable-flashinfer-autotune \ +--disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + +SERVER_PID=$! 
+ +wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" + +pip install -q datasets pandas + +run_benchmark_serving \ + --model "$MODEL" \ + --port "$PORT" \ + --backend vllm \ + --input-len "$ISL" \ + --output-len "$OSL" \ + --random-range-ratio "$RANDOM_RANGE_RATIO" \ + --num-prompts "$((CONC * 10))" \ + --max-concurrency "$CONC" \ + --result-filename "$RESULT_FILENAME" \ + --result-dir /workspace/ + +if [ "${RUN_EVAL}" = "true" ]; then + run_eval --framework lm-eval --port "$PORT" + append_lm_eval_summary +fi + +stop_gpu_monitor +set +x diff --git a/perf-changelog.yaml b/perf-changelog.yaml index ddc6409c2..41c5c080d 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1725,7 +1725,7 @@ - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040 -- config-keys: +- config-keys: - glm5.1-fp4-mi355x-atom description: - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark" @@ -1733,3 +1733,14 @@ - "TP=2 and TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths" - "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043 + +- config-keys: + - dsv4-fp4-b300-sglang + description: + - "Add DeepSeek-V4-Flash FP4 B300 SGLang benchmark" + - "Image: lmsysorg/sglang:deepseek-v4-blackwell" + - "Model: deepseek-ai/DeepSeek-V4-Flash (FP4 MoE experts + FP8 attention/dense)" + - "Reuses the B200 Flash Low-Latency recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships" + - "Speculative decoding (EAGLE) and prefix caching disabled" + - "TP=4/EP=4, concurrency 4-128 for 1k1k and 8k1k" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX From efdc8ba8622e09207d4487423f478a70cb367bbc Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:18:57 -0500 Subject: [PATCH 02/24] fix: 
switch dsv4-fp4-b300-sglang to Pro + Max-Throughput recipe Match parallelism (TP=8/EP=8/dp-attn=true) and concurrency ranges (4-1024 for 1k1k, 4-512 for 8k1k) to dsv4-fp4-b200-vllm. Use the DeepSeek-V4-Pro variant with the cookbook Max-Throughput recipe (DP=8 + DeepEP, no MTP), which aligns with the requested no-spec parallelism. Prefix caching remains disabled. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 11 +++++----- benchmarks/single_node/dsv4_fp4_b300.sh | 28 +++++++++++++++++-------- perf-changelog.yaml | 12 +++++------ 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index a7dcdb20f..458c4c928 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1798,11 +1798,12 @@ dsr1-fp8-b300-sglang: # NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 # lists B200 (not B300) as the Blackwell target. This config reuses the -# B200 Flash FP4 Low-Latency recipe on B300 until a B300-specific recipe -# ships. Speculative decoding (EAGLE) and prefix caching are disabled. +# B200 Pro FP4 Max-Throughput recipe (DP=8 + DeepEP, no MTP) on B300 +# until a B300-specific recipe ships. Prefix caching is disabled. +# Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. 
dsv4-fp4-b300-sglang: image: lmsysorg/sglang:deepseek-v4-blackwell - model: deepseek-ai/DeepSeek-V4-Flash + model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 precision: fp4 @@ -1812,11 +1813,11 @@ dsv4-fp4-b300-sglang: - isl: 1024 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 4, ep: 4, conc-start: 4, conc-end: 128 } + - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh index dc0244f36..89b87ac24 100755 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -1,9 +1,10 @@ #!/usr/bin/env bash # NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# only ships a B200 recipe for Blackwell. This script reuses the B200 Flash -# FP4 Low-Latency recipe as-is on B300 until a B300-specific recipe ships. -# Speculative decoding (EAGLE) and prefix caching are disabled per request. +# only ships a B200 recipe for Blackwell. This script reuses the B200 +# DeepSeek-V4-Pro Max-Throughput recipe (DP=8 + DeepEP, no MTP) as-is on +# B300 until a B300-specific recipe ships. Parallelism and concurrency +# ranges mirror dsv4-fp4-b200-vllm. Prefix caching is disabled. 
source "$(dirname "$0")/../benchmark_lib.sh" @@ -15,7 +16,8 @@ check_env_vars \ OSL \ RANDOM_RANGE_RATIO \ RESULT_FILENAME \ - EP_SIZE + EP_SIZE \ + DP_ATTENTION if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -26,11 +28,17 @@ hf download "$MODEL" nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 +export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 SERVER_LOG=/workspace/server.log PORT=${PORT:-8888} -echo "TP: $TP, EP_SIZE: $EP_SIZE, CONC: $CONC, ISL: $ISL, OSL: $OSL" +echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" + +DP_ATTN_ARGS="" +if [ "$DP_ATTENTION" = "true" ]; then + DP_ATTN_ARGS="--data-parallel-size $TP --enable-dp-attention" +fi EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -43,10 +51,12 @@ start_gpu_monitor set -x PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ --trust-remote-code \ ---tensor-parallel-size=$TP --ep-size $EP_SIZE \ ---moe-runner-backend flashinfer_mxfp4 \ ---chunked-prefill-size 4096 \ ---disable-flashinfer-autotune \ +--tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \ +--moe-a2a-backend deepep \ +--deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \ +--mem-fraction-static 0.82 \ +--cuda-graph-max-bs 64 \ +--max-running-requests 256 \ --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! 
diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 41c5c080d..bc8c1bffe 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1737,10 +1737,10 @@ - config-keys: - dsv4-fp4-b300-sglang description: - - "Add DeepSeek-V4-Flash FP4 B300 SGLang benchmark" + - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark" - "Image: lmsysorg/sglang:deepseek-v4-blackwell" - - "Model: deepseek-ai/DeepSeek-V4-Flash (FP4 MoE experts + FP8 attention/dense)" - - "Reuses the B200 Flash Low-Latency recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships" - - "Speculative decoding (EAGLE) and prefix caching disabled" - - "TP=4/EP=4, concurrency 4-128 for 1k1k and 8k1k" - pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX + - "Model: deepseek-ai/DeepSeek-V4-Pro (FP4 MoE experts + FP8 attention/dense)" + - "Reuses the B200 Pro Max-Throughput recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships" + - "DP=8 + DeepEP, prefix caching disabled, no speculative decoding" + - "Parallelism (TP=8/EP=8/dp-attn=true) and concurrency ranges (4-1024 for 1k1k, 4-512 for 8k1k) mirror dsv4-fp4-b200-vllm" + pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132 From cc35a12e0ede9bae596aa45d6a1ff4009d46f10f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:22:09 -0500 Subject: [PATCH 03/24] chore: sync launch_b200-dgxc-slurm.sh cache mount from claude/add-dsv4-fp4-b200-vllm Port the HF cache mount rework from the DSV4 B200 VLLM branch so both PRs stay consistent: use the shared /scratch/fsw/gharunners/hf-hub-cache path, drop the local MODEL override, and mount onto \$HF_HUB_CACHE inside the container. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b200-dgxc-slurm.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh index c0f25310b..b9d4d90cc 100644 --- a/runners/launch_b200-dgxc-slurm.sh +++ b/runners/launch_b200-dgxc-slurm.sh @@ -249,8 +249,7 @@ EOF else - HF_HUB_CACHE_MOUNT="/scratch/fsw/models" - export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}" + HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache" SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') @@ -276,7 +275,7 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ From 404a097a6d5c3b28e1e89309fe2ddb2e48d60f87 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:23:12 -0500 Subject: [PATCH 04/24] fix: restore trailing whitespace stripped from glm5.1 changelog entry The dsv4-fp4-b300-sglang entry was appended correctly, but the earlier edit also stripped trailing spaces on an existing line, producing a spurious deletion. Revert so the diff is additive-only. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index bc8c1bffe..5b00b2f3e 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1725,7 +1725,7 @@ - "Add Qwen3.5-397B-A17B FP8 MI355X ATOM benchmark configs with and without MTP" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1040 -- config-keys: +- config-keys: - glm5.1-fp4-mi355x-atom description: - "Add GLM-5.1 MXFP4 single-node MI355X ATOM benchmark" From 97a488e978b5e3b787df04ccdf35a0a4622dfd43 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:29:10 -0500 Subject: [PATCH 05/24] chore: add flock-guarded squash import to B300 runner Mirror the lockfile logic already in launch_b200-dgxc-slurm.sh and launch_h200-dgxc-slurm.sh: serialize concurrent enroot imports of the same squash file via flock, skip the import when the squash is already valid, and override ENROOT_CACHE_PATH to avoid permission issues with the system-wide cache on worker nodes. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index b49391a3c..1d8bd59b4 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -258,13 +258,27 @@ else SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + LOCK_FILE="${SQUASH_FILE}.lock" # Pin to one of the known-good B300 nodes; others have hardware/network # issues that cause benchmarks to hang or fail to start. 
salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" + # Use flock to serialize concurrent imports to the same squash file + # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes + srun --jobid=$JOB_ID bash -c " + export ENROOT_CACHE_PATH=\$HOME/.cache/enroot + mkdir -p \$ENROOT_CACHE_PATH + exec 9>\"$LOCK_FILE\" + flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } + if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then + echo 'Squash file already exists and is valid, skipping import' + else + rm -f \"$SQUASH_FILE\" + enroot import -o \"$SQUASH_FILE\" docker://$IMAGE + fi + " srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ From 106deeaae8d594789b4467429b9e0edd2effbc2f Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 01:45:45 -0500 Subject: [PATCH 06/24] fix: drop ENROOT_CACHE_PATH override from B300 runner The override ("avoid permission issues with system-wide cache on worker nodes") is a dgxc-slurm-specific workaround; launch_b300-nv.sh is on the NV slurm cluster, not dgxc-slurm. Copying it in caused the benchmark srun's pyxis shadow hook to fail with 'mkdir: cannot create directory pyxis_$JOBID.1/data: File exists'. Keep the flock + skip-if-valid logic. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 3 --- 1 file changed, 3 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 1d8bd59b4..51596b7b7 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -266,10 +266,7 @@ else JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) # Use flock to serialize concurrent imports to the same squash file - # Override ENROOT_CACHE_PATH to avoid permission issues with system-wide cache on worker nodes srun --jobid=$JOB_ID bash -c " - export ENROOT_CACHE_PATH=\$HOME/.cache/enroot - mkdir -p \$ENROOT_CACHE_PATH exec 9>\"$LOCK_FILE\" flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then From 4bb1f1ae599abf76cd954dbd0b0611d7caf4609a Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:07:39 -0500 Subject: [PATCH 07/24] chore: point B300 runner at shared gharunners/{squash,hf-hub-cache} Move the squash cache from /data/squash to /data/home/sa-shared/gharunners/squash, and the HF cache mount from /scratch/models to /data/home/sa-shared/gharunners/hf-hub-cache. Also mount the host HF cache onto \$HF_HUB_CACHE inside the container so tools reading the default HF path pick it up (matches the B200 dgxc-slurm runner). Drop the /scratch/models Qwen3.5 path override since that path is no longer used. Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 51596b7b7..ecb24b1a1 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -248,14 +248,8 @@ find . -name '.nfs*' -delete 2>/dev/null || true else - HF_HUB_CACHE_MOUNT="/scratch/models" - # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster, - # so point MODEL at the local copy. 
Other models fall through and use `hf download` - # against the mounted cache from their benchmark script. - if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then - export MODEL="/scratch/models/${MODEL#*/}" - fi - SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + HF_HUB_CACHE_MOUNT="/data/home/sa-shared/gharunners/hf-hub-cache" + SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" @@ -279,7 +273,7 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ From 744c5a0e3df14f0a9bc3b204cef787a4d9d58fb4 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:24:50 -0500 Subject: [PATCH 08/24] fix: move enroot import out of srun to avoid pyxis namespace collision Running two srun steps in the same allocation (flock+import, then the benchmark --container-image srun) reproducibly fails on this cluster with: error: pyxis: mkdir: cannot create directory '/scratch/data/user-$UID/pyxis_$JOBID.1/data': File exists error: pyxis: [ERROR] /etc/enroot/hooks.d/10-shadow.sh exited with return code 1 Per NVIDIA/pyxis#138, two srun steps sharing an allocation can leave enroot/pyxis state between steps. Collapsing to a single srun (the benchmark) is the cleanest workaround. Move the flock-guarded enroot import to the host side, before salloc. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index ecb24b1a1..f58d38abc 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -254,23 +254,29 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" + # Import the squash file on the host (outside SLURM) rather than inside an + # srun step. Running two srun steps (one import, one benchmark) in the same + # allocation trips a pyxis namespace collision on this cluster: + # error: pyxis: mkdir: cannot create directory + # '/scratch/data/user-$UID/pyxis_$JOBID.1/data': File exists + # Collapsing to a single srun (the benchmark) avoids it entirely. flock + # serializes concurrent imports of the same squash by parallel GH jobs. + ( + exec 9>"$LOCK_FILE" + flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; } + if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then + echo "Squash file already exists and is valid, skipping import" + else + rm -f "$SQUASH_FILE" + enroot import -o "$SQUASH_FILE" "docker://$IMAGE" + fi + ) + # Pin to one of the known-good B300 nodes; others have hardware/network # issues that cause benchmarks to hang or fail to start. 
salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - # Use flock to serialize concurrent imports to the same squash file - srun --jobid=$JOB_ID bash -c " - exec 9>\"$LOCK_FILE\" - flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; } - if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then - echo 'Squash file already exists and is valid, skipping import' - else - rm -f \"$SQUASH_FILE\" - enroot import -o \"$SQUASH_FILE\" docker://$IMAGE - fi - " - srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From d003c59c5917175266dc0b30cf45d904d17800a7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:34:34 -0500 Subject: [PATCH 09/24] fix: wipe stale pyxis scratch dirs for this JOB_ID before benchmark srun Even with a single srun step, pyxis fails with error: pyxis: mkdir: cannot create directory '/scratch/data/user-$UID/pyxis_$JOBID.0/data': File exists on fresh SLURM JOB_IDs. The /scratch path is left behind by previous jobs whose IDs SLURM later reuses (and the cluster's pyxis epilog doesn't clean it up). Wipe pyxis_$JOBID.* from the host after salloc; no-op if /scratch is node-local, effective if it's shared NFS. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index f58d38abc..dbd4eae72 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -277,6 +277,14 @@ else salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) + # Stale pyxis scratch from prior jobs with reused SLURM job IDs breaks the + # next container srun with + # error: pyxis: mkdir: cannot create directory + # '/scratch/data/user-$UID/pyxis_$JOBID.0/data': File exists + # If /scratch is shared across b300 nodes this cleanup works; if it's + # node-local it's a harmless no-op. + rm -rf "/scratch/data/user-$(id -u)/pyxis_${JOB_ID}."* 2>/dev/null || true + srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ From f00629fa68a602ceab888fe1407957bbd005b6b3 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 02:44:07 -0500 Subject: [PATCH 10/24] Revert: drop all B300 runner changes, mirror #1128's approach PR #1128 (dsv4-fp4fp8-b300-vllm) runs on the same cluster with ZERO changes to launch_b300-nv.sh. The pyxis 10-shadow.sh failures we were chasing aren't caused by the runner -- reset it to origin/main and keep the sglang config/bench additions only. 
Reverts (from this branch): - 4bb1f1ae point B300 runner at shared gharunners/{squash,hf-hub-cache} - 106deeaa drop ENROOT_CACHE_PATH override - 97a488e9 add flock-guarded squash import - 744c5a0e move enroot import out of srun - d003c59c wipe stale pyxis scratch before benchmark srun Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 39 ++++++++++----------------------------- 1 file changed, 10 insertions(+), 29 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index dbd4eae72..b49391a3c 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -248,46 +248,27 @@ find . -name '.nfs*' -delete 2>/dev/null || true else - HF_HUB_CACHE_MOUNT="/data/home/sa-shared/gharunners/hf-hub-cache" - SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + HF_HUB_CACHE_MOUNT="/scratch/models" + # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster, + # so point MODEL at the local copy. Other models fall through and use `hf download` + # against the mounted cache from their benchmark script. + if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then + export MODEL="/scratch/models/${MODEL#*/}" + fi + SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') - LOCK_FILE="${SQUASH_FILE}.lock" - - # Import the squash file on the host (outside SLURM) rather than inside an - # srun step. Running two srun steps (one import, one benchmark) in the same - # allocation trips a pyxis namespace collision on this cluster: - # error: pyxis: mkdir: cannot create directory - # '/scratch/data/user-$UID/pyxis_$JOBID.1/data': File exists - # Collapsing to a single srun (the benchmark) avoids it entirely. flock - # serializes concurrent imports of the same squash by parallel GH jobs. 
- ( - exec 9>"$LOCK_FILE" - flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; } - if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then - echo "Squash file already exists and is valid, skipping import" - else - rm -f "$SQUASH_FILE" - enroot import -o "$SQUASH_FILE" "docker://$IMAGE" - fi - ) # Pin to one of the known-good B300 nodes; others have hardware/network # issues that cause benchmarks to hang or fail to start. salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - # Stale pyxis scratch from prior jobs with reused SLURM job IDs breaks the - # next container srun with - # error: pyxis: mkdir: cannot create directory - # '/scratch/data/user-$UID/pyxis_$JOBID.0/data': File exists - # If /scratch is shared across b300 nodes this cleanup works; if it's - # node-local it's a harmless no-op. - rm -rf "/scratch/data/user-$(id -u)/pyxis_${JOB_ID}."* 2>/dev/null || true + srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \ + --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ --no-container-mount-home \ --container-workdir=/workspace/ \ --no-container-entrypoint --export=ALL,PORT=8888 \ From 570b0ebcc39a003750edb1bc9c46512f1ab148bf Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 09:57:17 -0500 Subject: [PATCH 11/24] runner: add head-node flock-guarded squash import on B300 Move enroot import out of srun to the head node and serialize parallel GH jobs with flock on the shared squash file. Skips the import when a valid squash already exists. The benchmark srun is now the only step in the allocation. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- runners/launch_b300-nv.sh | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index b49391a3c..0dbc15d17 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -255,17 +255,30 @@ else if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then export MODEL="/scratch/models/${MODEL#*/}" fi - SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" + SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') + LOCK_FILE="${SQUASH_FILE}.lock" + + # Import the squash file on the head node (outside any srun) under flock. + # Parallel GH jobs target the same shared squash path; flock serializes + # imports so only one job pulls and writes the file while the rest wait. + ( + exec 9>"$LOCK_FILE" + flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; } + if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then + echo "Squash file already exists and is valid, skipping import" + else + rm -f "$SQUASH_FILE" + enroot import -o "$SQUASH_FILE" "docker://$IMAGE" + fi + ) # Pin to one of the known-good B300 nodes; others have hardware/network # issues that cause benchmarks to hang or fail to start. 
salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME" JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1) - srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ From 864419d8b3c06ec31e2603db64ef68955acdb3ea Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 10:35:31 -0500 Subject: [PATCH 12/24] fix: mount at /ix and clear baked-in CUDA_VISIBLE_DEVICES Port the B200 branch's fix for the lmsysorg/sglang:deepseek-v4-blackwell image on B300: - The image installs sglang editable under /workspace/sglang; the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and breaks 'import sglang'. For this image, mount at /ix instead. - The image's ENV bakes CUDA_VISIBLE_DEVICES=4,5,6,7, masking half the GPUs Slurm allocates. Unset it in the bench script so TP=8 sees all 8. - Write artefacts under $PWD instead of hard-coded /workspace. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b300.sh | 12 ++++++++++-- runners/launch_b300-nv.sh | 15 +++++++++++++-- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh index 89b87ac24..90c0e681a 100755 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -30,7 +30,15 @@ nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 -SERVER_LOG=/workspace/server.log +# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV, +# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to +# all ranks. 
+unset CUDA_VISIBLE_DEVICES + +# The runner mounts this repo at a non-/workspace path for the deepseek-v4-blackwell +# image (it installs sglang editable under /workspace/sglang, which our bind-mount +# would hide), so write artefacts relative to $PWD instead of a hard-coded /workspace. +SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" @@ -75,7 +83,7 @@ run_benchmark_serving \ --num-prompts "$((CONC * 10))" \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ - --result-dir /workspace/ + --result-dir "$PWD/" if [ "${RUN_EVAL}" = "true" ]; then run_eval --framework lm-eval --port "$PORT" diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 0dbc15d17..8ce5481ba 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -260,6 +260,17 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" + # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at + # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so + # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and + # breaks `import sglang`. Mount this one image at /ix instead; drop the + # conditional once the image stops installing editable under /workspace. + if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + CONTAINER_MOUNT_DIR=/ix + else + CONTAINER_MOUNT_DIR=/workspace + fi + # Import the squash file on the head node (outside any srun) under flock. # Parallel GH jobs target the same shared squash path; flock serializes # imports so only one job pulls and writes the file while the rest wait. 
@@ -281,9 +292,9 @@ else srun --jobid=$JOB_ID \ --container-image=$SQUASH_FILE \ - --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ + --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \ --no-container-mount-home \ - --container-workdir=/workspace/ \ + --container-workdir=$CONTAINER_MOUNT_DIR \ --no-container-entrypoint --export=ALL,PORT=8888 \ bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh From 9453676d638370bd8c1ae9ab8c60c58829974ce7 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 10:58:09 -0500 Subject: [PATCH 13/24] runner: use /data/models pre-staged path for dsv4 on B300 Pre-staged models on the B300 cluster live under /data/models (Qwen3.5-397B-A17B-FP8, dsv4-pro, etc.). Switch HF_HUB_CACHE_MOUNT from /scratch/models to /data/models, and export MODEL to /data/models/dsv4-pro when MODEL_PREFIX=dsv4 so the benchmark reads from the mounted dir directly. The bench script skips `hf download` when MODEL looks like an absolute path. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b300.sh | 6 +++++- runners/launch_b300-nv.sh | 13 ++++++++----- 2 files changed, 13 insertions(+), 6 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh index 90c0e681a..8ccbb9ead 100755 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -23,7 +23,11 @@ if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" fi -hf download "$MODEL" +# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip +# `hf download`. Only fetch when MODEL looks like a HF repo ID. 
+if [[ "$MODEL" != /* ]]; then + hf download "$MODEL" +fi nvidia-smi diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 8ce5481ba..76586238f 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -248,12 +248,15 @@ find . -name '.nfs*' -delete 2>/dev/null || true else - HF_HUB_CACHE_MOUNT="/scratch/models" - # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster, - # so point MODEL at the local copy. Other models fall through and use `hf download` - # against the mounted cache from their benchmark script. + # Pre-staged models on the B300 cluster live under /data/models. Point MODEL + # at the local copy so the benchmark skips `hf download` and reads from the + # mounted dir. Other models fall through and use `hf download` from their + # benchmark script. + HF_HUB_CACHE_MOUNT="/data/models" if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then - export MODEL="/scratch/models/${MODEL#*/}" + export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}" + elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then + export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro" fi SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '') From 5db43b8b63c5f6affd71ae8a27c3c62d3aef5626 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 14:18:00 -0500 Subject: [PATCH 14/24] fix: switch B300 dsv4 sglang to bw-ultra-compiled image The stock lmsysorg/sglang:deepseek-v4-blackwell image ships kernels compiled for B200 (SM_100) and crashes on B300 with RuntimeError: RMSNorm failed with error code no kernel image is available for execution on the device during CUDA graph capture. Switch to cquil/sglang-deepseek-v4-bw-ultra:v1, which is recompiled with B300 SM support. 
Broaden the /ix mount conditional to match both image tags: the fork keeps the same /workspace/sglang editable install that would otherwise be masked by $GITHUB_WORKSPACE:/workspace/. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_b300-nv.sh | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index ea7ff6e9f..294cfe47f 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1802,7 +1802,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: - image: lmsysorg/sglang:deepseek-v4-blackwell + image: cquil/sglang-deepseek-v4-bw-ultra:v1 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 76586238f..cc357015c 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -263,12 +263,13 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" - # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at - # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so - # the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and - # breaks `import sglang`. Mount this one image at /ix instead; drop the + # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell (and its B300-recompiled + # fork cquil/sglang-deepseek-v4-bw-ultra) installs sglang editable at + # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), + # so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install + # and breaks `import sglang`. Mount these images at /ix instead; drop the # conditional once the image stops installing editable under /workspace. 
- if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then + if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* ]]; then CONTAINER_MOUNT_DIR=/ix else CONTAINER_MOUNT_DIR=/workspace From c060c58dae0a8fa6b8576e48ccf9e88a1d8a75a5 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 14:43:11 -0500 Subject: [PATCH 15/24] fix: switch B300 dsv4 sglang image to yhyang201/sglang-b300:v3 Use the B300-recompiled image from yhyang201; extend the /ix mount conditional to match the new tag in addition to the previous deepseek-v4-blackwell / deepseek-v4-bw-ultra patterns. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_b300-nv.sh | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 294cfe47f..11c1a43f0 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1802,7 +1802,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. 
dsv4-fp4-b300-sglang: - image: cquil/sglang-deepseek-v4-bw-ultra:v1 + image: yhyang201/sglang-b300:v3 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index cc357015c..600912877 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -263,13 +263,13 @@ else SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '') LOCK_FILE="${SQUASH_FILE}.lock" - # TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell (and its B300-recompiled - # fork cquil/sglang-deepseek-v4-bw-ultra) installs sglang editable at - # /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), + # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell + # and its B300-recompiled forks like yhyang201/sglang-b300) install sglang + # editable at /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), # so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install # and breaks `import sglang`. Mount these images at /ix instead; drop the # conditional once the image stops installing editable under /workspace. 
- if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* ]]; then + if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *sglang-b300* ]]; then CONTAINER_MOUNT_DIR=/ix else CONTAINER_MOUNT_DIR=/workspace From 08edf26c59c3735ef4c01a41539fd155fcc39663 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 14:44:34 -0500 Subject: [PATCH 16/24] update b300 Launch the B300 dsv4 server with `sglang serve` instead of `python3 -m sglang.launch_server`, dropping the DeepEP, --ep-size, DP-attention, --cuda-graph-max-bs and --max-running-requests flags in favor of --moe-runner-backend flashinfer_mxfp4, --chunked-prefill-size 4096 and --disable-flashinfer-autotune. --mem-fraction-static 0.82 and --disable-radix-cache are kept. --- benchmarks/single_node/dsv4_fp4_b300.sh | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh index 8ccbb9ead..57932e929 100755 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -61,15 +61,17 @@ fi start_gpu_monitor set -x -PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path=$MODEL --host=0.0.0.0 --port=$PORT \ ---trust-remote-code \ ---tensor-parallel-size=$TP --ep-size $EP_SIZE $DP_ATTN_ARGS \ ---moe-a2a-backend deepep \ ---deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \ ---mem-fraction-static 0.82 \ ---cuda-graph-max-bs 64 \ ---max-running-requests 256 \ ---disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & +PYTHONNOUSERSITE=1 sglang serve \ + --model-path $MODEL \ + --host 0.0.0.0 \ + --port $PORT \ + --trust-remote-code \ + --tp $TP \ + --moe-runner-backend flashinfer_mxfp4 \ + --mem-fraction-static 0.82 \ + --chunked-prefill-size 4096 \ + --disable-flashinfer-autotune \ + --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$!
From a699ca091a331e5b7814c3695f7b79102fd5ac80 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 15:50:03 -0500 Subject: [PATCH 17/24] feat(dsv4-fp4-b300-sglang): pick recipe by CONC; split search-space Mirror chore/dsv4-sgl-b200 commits 103a202c + 43be495b for B300: Bench script now selects one of three cookbook recipes by CONC instead of a single static flag set: CONC <= 32 -> low-latency (TP only, chunked-prefill 4096, disable-flashinfer-autotune) 33..128 -> balanced (+ DP-attention, max-running-reqs=128, cuda-graph-max-bs=64, deepep-config) CONC > 128 -> max-throughput (+ DP-attention, max-running-reqs=256, cuda-graph-max-bs=64, deepep-config) No speculative decoding in any recipe; --disable-radix-cache kept for the no-prefix-caching baseline. Split the dsv4-fp4-b300-sglang search-space rows per recipe boundary so result filenames (ep=, dpa=) accurately reflect which recipe ran. ep=8 on balanced/max-throughput reflects sglang's implicit ep_size=tp_size override when --moe-a2a-backend deepep is set. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 22 ++++++- benchmarks/single_node/dsv4_fp4_b300.sh | 87 ++++++++++++++++--------- 2 files changed, 78 insertions(+), 31 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 11c1a43f0..c9a3368cb 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1809,15 +1809,33 @@ dsv4-fp4-b300-sglang: precision: fp4 framework: sglang multinode: false + # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 + # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC: + # low-latency (CONC <= 32): TP-only + # balanced (32 < CONC <= 128): + DP-attn + # max-throughput (CONC > 128): + DP-attn + # Split so result filenames (ep=, dpa=) accurately reflect the recipe. 
+ # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, + # while low-latency leaves ep_size at the default of 1. seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 1024 } + # low-latency + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + # balanced + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 8, dp-attn: true, conc-start: 4, conc-end: 512 } + # low-latency + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + # balanced + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh index 57932e929..faa946174 100755 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -1,11 +1,5 @@ #!/usr/bin/env bash -# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# only ships a B200 recipe for Blackwell. This script reuses the B200 -# DeepSeek-V4-Pro Max-Throughput recipe (DP=8 + DeepEP, no MTP) as-is on -# B300 until a B300-specific recipe ships. Parallelism and concurrency -# ranges mirror dsv4-fp4-b200-vllm. Prefix caching is disabled. 
- source "$(dirname "$0")/../benchmark_lib.sh" check_env_vars \ @@ -15,9 +9,7 @@ check_env_vars \ ISL \ OSL \ RANDOM_RANGE_RATIO \ - RESULT_FILENAME \ - EP_SIZE \ - DP_ATTENTION + RESULT_FILENAME if [[ -n "$SLURM_JOB_ID" ]]; then echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" @@ -32,25 +24,23 @@ fi nvidia-smi export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 -export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 -# The deepseek-v4-blackwell image bakes CUDA_VISIBLE_DEVICES=4,5,6,7 into its ENV, -# which masks half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to -# all ranks. +# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its +# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half +# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks. unset CUDA_VISIBLE_DEVICES -# The runner mounts this repo at a non-/workspace path for the deepseek-v4-blackwell -# image (it installs sglang editable under /workspace/sglang, which our bind-mount -# would hide), so write artefacts relative to $PWD instead of a hard-coded /workspace. +# TODO(Cam): the deepseek-v4 sglang images install sglang editable at +# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. +# The runner mounts our repo at a non-/workspace path for these images so the +# editable install stays visible. Paths in this script are $PWD-relative for +# that reason. Drop the runner conditional once lmsys moves sglang back out of +# /workspace. 
+ SERVER_LOG="$PWD/server.log" PORT=${PORT:-8888} -echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -DP_ATTN_ARGS="" -if [ "$DP_ATTENTION" = "true" ]; then - DP_ATTN_ARGS="--data-parallel-size $TP --enable-dp-attention" -fi +echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" EVAL_CONTEXT_ARGS="" if [ "${EVAL_ONLY}" = "true" ]; then @@ -58,7 +48,49 @@ if [ "${EVAL_ONLY}" = "true" ]; then EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" fi -start_gpu_monitor +start_gpu_monitor --output "$PWD/gpu_metrics.csv" + +# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): +# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune +# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 +# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + +if [[ $CONC -le 32 ]]; then + RECIPE=low-latency + RECIPE_FLAGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 4096 + --disable-flashinfer-autotune + --mem-fraction-static 0.82 + ) +elif [[ $CONC -le 128 ]]; then + RECIPE=balanced + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 + --cuda-graph-max-bs 64 + --max-running-requests 128 + ) +else + RECIPE=max-throughput + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 + --cuda-graph-max-bs 64 + --max-running-requests 256 + ) +fi +echo "Recipe: $RECIPE (CONC=$CONC)" set -x PYTHONNOUSERSITE=1 sglang serve \ @@ -67,11 +99,8 @@ PYTHONNOUSERSITE=1 sglang serve \ --port 
$PORT \ --trust-remote-code \ --tp $TP \ - --moe-runner-backend flashinfer_mxfp4 \ - --mem-fraction-static 0.82 \ - --chunked-prefill-size 4096 \ - --disable-flashinfer-autotune \ - --disable-radix-cache $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & + --disable-radix-cache \ + "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & SERVER_PID=$! @@ -86,7 +115,7 @@ run_benchmark_serving \ --input-len "$ISL" \ --output-len "$OSL" \ --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts "$((CONC * 10))" \ + --num-prompts $((CONC * 10)) \ --max-concurrency "$CONC" \ --result-filename "$RESULT_FILENAME" \ --result-dir "$PWD/" From d35696cab3b0e1c51f6ae2334b0a0c36b058e62c Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 15:56:58 -0500 Subject: [PATCH 18/24] update b300 Switch B300 dsv4 sglang image to lmsysorg/sglang:deepseek-v4-b300 and extend the /ix mount conditional to match the new tag. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 2 +- runners/launch_b300-nv.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index c9a3368cb..1c9f9beba 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1802,7 +1802,7 @@ dsr1-fp8-b300-sglang: # until a B300-specific recipe ships. Prefix caching is disabled. # Parallelisms and concurrency ranges mirror dsv4-fp4-b200-vllm. dsv4-fp4-b300-sglang: - image: yhyang201/sglang-b300:v3 + image: lmsysorg/sglang:deepseek-v4-b300 model: deepseek-ai/DeepSeek-V4-Pro model-prefix: dsv4 runner: b300 diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh index 600912877..3daac0167 100644 --- a/runners/launch_b300-nv.sh +++ b/runners/launch_b300-nv.sh @@ -269,7 +269,7 @@ else # so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install # and breaks `import sglang`. 
Mount these images at /ix instead; drop the # conditional once the image stops installing editable under /workspace. - if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *sglang-b300* ]]; then + if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *deepseek-v4-b300* || "$IMAGE" == *sglang-b300* ]]; then CONTAINER_MOUNT_DIR=/ix else CONTAINER_MOUNT_DIR=/workspace From bc43672775655dee5e1e5666bb6f03cb5d876e5e Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Fri, 24 Apr 2026 16:40:55 -0500 Subject: [PATCH 19/24] feat(dsv4-fp4-b300-sglang): hardcode low-latency recipe at every CONC The DeepEP FP8 weight-postprocess path is broken for deepseek-ai/DeepSeek-V4-Pro on B300 with lmsysorg/sglang:deepseek-v4-b300 -- every sglang launch with --moe-a2a-backend deepep fails during model load with RuntimeError: Recipe must be a list/tuple of 3 integers. raised from sglang.srt.layers.quantization.fp8 .process_weights_after_loading_block_quant (fp8.py:957). The balanced and max-throughput recipes both go through that path; the low-latency recipe (TP-only, flashinfer_mxfp4 MoE) does not and loads cleanly. Collapse the yaml search-space back to a single row spanning the full CONC range (4..1024 for 1k1k, 4..512 for 8k1k) and hardcode the bench script to the low-latency flags at every CONC. TODO(Cam) noted in both files to restore the recipe-per-CONC dispatch once the DeepEP FP8 load path is fixed upstream. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/configs/nvidia-master.yaml | 29 +++++--------- benchmarks/single_node/dsv4_fp4_b300.sh | 53 ++++++------------------- 2 files changed, 22 insertions(+), 60 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index 1c9f9beba..ea71490bd 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1809,33 +1809,22 @@ dsv4-fp4-b300-sglang: precision: fp4 framework: sglang multinode: false - # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 - # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC: - # low-latency (CONC <= 32): TP-only - # balanced (32 < CONC <= 128): + DP-attn - # max-throughput (CONC > 128): + DP-attn - # Split so result filenames (ep=, dpa=) accurately reflect the recipe. - # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, - # while low-latency leaves ep_size at the default of 1. + # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP) + # while the DeepEP FP8 weight-postprocess path is broken for this + # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3 + # integers. raised from sglang.srt.layers.quantization.fp8 + # .process_weights_after_loading_block_quant). Full concurrency sweep + # retained; restore the recipe-per-CONC split (balanced + max-throughput + # rows) once sglang can load the checkpoint under --moe-a2a-backend deepep. 
seq-len-configs: - isl: 1024 osl: 1024 search-space: - # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - # low-latency - - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } - # balanced - - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } - # max-throughput - - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } + - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh index faa946174..79856c2ec 100755 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -50,46 +50,19 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): -# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune -# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 -# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 -DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - -if [[ $CONC -le 32 ]]; then - RECIPE=low-latency - RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 - --disable-flashinfer-autotune - --mem-fraction-static 0.82 - ) -elif [[ $CONC -le 128 ]]; then - RECIPE=balanced - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - 
--max-running-requests 128 - ) -else - RECIPE=max-throughput - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 256 - ) -fi +# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the +# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300 +# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from +# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant). +# Restore the CONC-based low-latency / balanced / max-throughput dispatch +# once sglang can load the checkpoint under --moe-a2a-backend deepep. +RECIPE=low-latency +RECIPE_FLAGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 4096 + --disable-flashinfer-autotune + --mem-fraction-static 0.82 +) echo "Recipe: $RECIPE (CONC=$CONC)" set -x From 87c83764218be53bd4b5079d583b299a0c7e0792 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sat, 25 Apr 2026 00:11:03 -0500 Subject: [PATCH 20/24] trigger test check From 90e8f3d8c32e04100eb1dc3635e5a3d82ab1ad88 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sat, 25 Apr 2026 00:25:02 -0500 Subject: [PATCH 21/24] Revert "feat(dsv4-fp4-b300-sglang): hardcode low-latency recipe at every CONC" This reverts commit bc43672775655dee5e1e5666bb6f03cb5d876e5e. 
--- .github/configs/nvidia-master.yaml | 29 +++++++++----- benchmarks/single_node/dsv4_fp4_b300.sh | 53 +++++++++++++++++++------ 2 files changed, 60 insertions(+), 22 deletions(-) diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index bd8d5bddc..42c720a63 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1845,22 +1845,33 @@ dsv4-fp4-b300-sglang: precision: fp4 framework: sglang multinode: false - # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP) - # while the DeepEP FP8 weight-postprocess path is broken for this - # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3 - # integers. raised from sglang.srt.layers.quantization.fp8 - # .process_weights_after_loading_block_quant). Full concurrency sweep - # retained; restore the recipe-per-CONC split (balanced + max-throughput - # rows) once sglang can load the checkpoint under --moe-a2a-backend deepep. + # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 + # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC: + # low-latency (CONC <= 32): TP-only + # balanced (32 < CONC <= 128): + DP-attn + # max-throughput (CONC > 128): + DP-attn + # Split so result filenames (ep=, dpa=) accurately reflect the recipe. + # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size, + # while low-latency leaves ep_size at the default of 1. 
seq-len-configs: - isl: 1024 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 } + # low-latency + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + # balanced + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 } - isl: 8192 osl: 1024 search-space: - - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 } + # low-latency + - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 } + # balanced + - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 } + # max-throughput + - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 } qwen3.5-bf16-b200-sglang: image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh index 79856c2ec..faa946174 100755 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ b/benchmarks/single_node/dsv4_fp4_b300.sh @@ -50,19 +50,46 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the -# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300 -# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from -# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant). -# Restore the CONC-based low-latency / balanced / max-throughput dispatch -# once sglang can load the checkpoint under --moe-a2a-backend deepep. 
-RECIPE=low-latency -RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 - --disable-flashinfer-autotune - --mem-fraction-static 0.82 -) +# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): +# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune +# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 +# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + +if [[ $CONC -le 32 ]]; then + RECIPE=low-latency + RECIPE_FLAGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 4096 + --disable-flashinfer-autotune + --mem-fraction-static 0.82 + ) +elif [[ $CONC -le 128 ]]; then + RECIPE=balanced + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 + --cuda-graph-max-bs 64 + --max-running-requests 128 + ) +else + RECIPE=max-throughput + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 + --cuda-graph-max-bs 64 + --max-running-requests 256 + ) +fi echo "Recipe: $RECIPE (CONC=$CONC)" set -x From 8e3158d4cabf90f995c5cfa5dd0f918dbe782012 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sat, 25 Apr 2026 00:34:52 -0500 Subject: [PATCH 22/24] trigger test check From 623baa1a4dc8ce91fd86bd6a926dd6d76593125d Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sat, 25 Apr 2026 01:40:21 -0500 Subject: [PATCH 23/24] Move dsv4 b300 sglang bench script to framework-tagged path Per the runner naming convention introduced in #1146 (BENCH_SCRIPT="${BENCH_BASE}_${FRAMEWORK}${SPEC_SUFFIX}.sh"), the 
b300 runner now prefers benchmarks/single_node/dsv4_fp4_b300_sglang.sh over the legacy dsv4_fp4_b300.sh. The merge from main left this branch with both scripts: the legacy file carrying the recipe-per-CONC dispatch this PR added, and the framework-tagged file with the low-latency-only fallback content from main. CI was therefore picking the wrong script. Move the recipe-per-CONC dispatch onto dsv4_fp4_b300_sglang.sh and delete the legacy filename so the runner picks up the intended logic. Update the yaml comment to point at the new path. Co-Authored-By: Claude Opus 4.7 (1M context) --- benchmarks/single_node/dsv4_fp4_b300.sh | 129 ------------------ .../single_node/dsv4_fp4_b300_sglang.sh | 54 ++++++-- 2 files changed, 40 insertions(+), 143 deletions(-) delete mode 100755 benchmarks/single_node/dsv4_fp4_b300.sh diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh deleted file mode 100755 index faa946174..000000000 --- a/benchmarks/single_node/dsv4_fp4_b300.sh +++ /dev/null @@ -1,129 +0,0 @@ -#!/usr/bin/env bash - -source "$(dirname "$0")/../benchmark_lib.sh" - -check_env_vars \ - MODEL \ - TP \ - CONC \ - ISL \ - OSL \ - RANDOM_RANGE_RATIO \ - RESULT_FILENAME - -if [[ -n "$SLURM_JOB_ID" ]]; then - echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME" -fi - -# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip -# `hf download`. Only fetch when MODEL looks like a HF repo ID. -if [[ "$MODEL" != /* ]]; then - hf download "$MODEL" -fi - -nvidia-smi - -export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0 - -# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its -# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half -# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks. -unset CUDA_VISIBLE_DEVICES - -# TODO(Cam): the deepseek-v4 sglang images install sglang editable at -# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang. 
-# The runner mounts our repo at a non-/workspace path for these images so the -# editable install stays visible. Paths in this script are $PWD-relative for -# that reason. Drop the runner conditional once lmsys moves sglang back out of -# /workspace. - -SERVER_LOG="$PWD/server.log" -PORT=${PORT:-8888} - -echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL" - -EVAL_CONTEXT_ARGS="" -if [ "${EVAL_ONLY}" = "true" ]; then - setup_eval_context - EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN" -fi - -start_gpu_monitor --output "$PWD/gpu_metrics.csv" - -# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 -# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): -# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune -# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 -# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 -DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' - -if [[ $CONC -le 32 ]]; then - RECIPE=low-latency - RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 - --disable-flashinfer-autotune - --mem-fraction-static 0.82 - ) -elif [[ $CONC -le 128 ]]; then - RECIPE=balanced - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 128 - ) -else - RECIPE=max-throughput - export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 - RECIPE_FLAGS=( - --dp-size "$TP" - --enable-dp-attention - --moe-a2a-backend deepep - --deepep-config "$DEEPEP_CONFIG" - --mem-fraction-static 0.82 - --cuda-graph-max-bs 64 - --max-running-requests 256 - ) -fi -echo "Recipe: $RECIPE (CONC=$CONC)" - -set -x -PYTHONNOUSERSITE=1 sglang serve \ - --model-path $MODEL \ - --host 0.0.0.0 \ - --port $PORT \ - 
--trust-remote-code \ - --tp $TP \ - --disable-radix-cache \ - "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > $SERVER_LOG 2>&1 & - -SERVER_PID=$! - -wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID" - -pip install -q datasets pandas - -run_benchmark_serving \ - --model "$MODEL" \ - --port "$PORT" \ - --backend vllm \ - --input-len "$ISL" \ - --output-len "$OSL" \ - --random-range-ratio "$RANDOM_RANGE_RATIO" \ - --num-prompts $((CONC * 10)) \ - --max-concurrency "$CONC" \ - --result-filename "$RESULT_FILENAME" \ - --result-dir "$PWD/" - -if [ "${RUN_EVAL}" = "true" ]; then - run_eval --framework lm-eval --port "$PORT" - append_lm_eval_summary -fi - -stop_gpu_monitor -set +x diff --git a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh index c9fb238a5..faa946174 100755 --- a/benchmarks/single_node/dsv4_fp4_b300_sglang.sh +++ b/benchmarks/single_node/dsv4_fp4_b300_sglang.sh @@ -50,20 +50,46 @@ fi start_gpu_monitor --output "$PWD/gpu_metrics.csv" -# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the -# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300 -# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from -# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant). -# Restore the CONC-based low-latency / balanced / max-throughput dispatch -# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under -# --moe-a2a-backend deepep. 
-RECIPE=low-latency -RECIPE_FLAGS=( - --moe-runner-backend flashinfer_mxfp4 - --chunked-prefill-size 4096 - --disable-flashinfer-autotune - --mem-fraction-static 0.82 -) +# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 +# (spec-decoding / MTP and prefix-caching flags dropped for the baseline): +# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune +# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128 +# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256 +DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' + +if [[ $CONC -le 32 ]]; then + RECIPE=low-latency + RECIPE_FLAGS=( + --moe-runner-backend flashinfer_mxfp4 + --chunked-prefill-size 4096 + --disable-flashinfer-autotune + --mem-fraction-static 0.82 + ) +elif [[ $CONC -le 128 ]]; then + RECIPE=balanced + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 + --cuda-graph-max-bs 64 + --max-running-requests 128 + ) +else + RECIPE=max-throughput + export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256 + RECIPE_FLAGS=( + --dp-size "$TP" + --enable-dp-attention + --moe-a2a-backend deepep + --deepep-config "$DEEPEP_CONFIG" + --mem-fraction-static 0.82 + --cuda-graph-max-bs 64 + --max-running-requests 256 + ) +fi echo "Recipe: $RECIPE (CONC=$CONC)" set -x From 54b2ced5e80684c02c999dd3da8d61c5bb44a838 Mon Sep 17 00:00:00 2001 From: Cam Quilici Date: Sat, 25 Apr 2026 01:43:18 -0500 Subject: [PATCH 24/24] chore(perf-changelog): tighten dsv4-fp4-b300-sglang entry Now that DeepEP FP8 loads cleanly, this PR is purely about restoring the recipe-per-CONC split on top of the low-latency-only fallback from #1143. Trim the changelog to that delta. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- perf-changelog.yaml | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 70593a980..397da6591 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -1816,10 +1816,6 @@ - config-keys: - dsv4-fp4-b300-sglang description: - - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark" - - "Image: lmsysorg/sglang:deepseek-v4-blackwell" - - "Model: deepseek-ai/DeepSeek-V4-Pro (FP4 MoE experts + FP8 attention/dense)" - - "Reuses the B200 Pro Max-Throughput recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships" - - "DP=8 + DeepEP, prefix caching disabled, no speculative decoding" - - "Parallelism (TP=8/EP=8/dp-attn=true) and concurrency ranges (4-1024 for 1k1k, 4-512 for 8k1k) mirror dsv4-fp4-b200-vllm" + - "Restore the recipe-per-CONC split (low-latency / balanced / max-throughput) on top of the low-latency-only fallback from #1143; the DeepEP FP8 weight-postprocess path is fixed, so the DeepEP-backed balanced and max-throughput recipes run again" + - "Recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4" pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132