Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
26e540d
feat: add DeepSeek-V4-Flash FP4 B300 SGLang benchmark
cquil11 Apr 24, 2026
efdc8ba
fix: switch dsv4-fp4-b300-sglang to Pro + Max-Throughput recipe
cquil11 Apr 24, 2026
cc35a12
chore: sync launch_b200-dgxc-slurm.sh cache mount from claude/add-dsv…
cquil11 Apr 24, 2026
404a097
fix: restore trailing whitespace stripped from glm5.1 changelog entry
cquil11 Apr 24, 2026
97a488e
chore: add flock-guarded squash import to B300 runner
cquil11 Apr 24, 2026
106deea
fix: drop ENROOT_CACHE_PATH override from B300 runner
cquil11 Apr 24, 2026
4bb1f1a
chore: point B300 runner at shared gharunners/{squash,hf-hub-cache}
cquil11 Apr 24, 2026
744c5a0
fix: move enroot import out of srun to avoid pyxis namespace collision
cquil11 Apr 24, 2026
d003c59
fix: wipe stale pyxis scratch dirs for this JOB_ID before benchmark srun
cquil11 Apr 24, 2026
f00629f
Revert: drop all B300 runner changes, mirror #1128's approach
cquil11 Apr 24, 2026
570b0eb
runner: add head-node flock-guarded squash import on B300
cquil11 Apr 24, 2026
864419d
fix: mount at /ix and clear baked-in CUDA_VISIBLE_DEVICES
cquil11 Apr 24, 2026
5d93913
Merge branch 'main' into chore/dsv4-sgl-b300
cquil11 Apr 24, 2026
9453676
runner: use /data/models pre-staged path for dsv4 on B300
cquil11 Apr 24, 2026
5db43b8
fix: switch B300 dsv4 sglang to bw-ultra-compiled image
cquil11 Apr 24, 2026
c060c58
fix: switch B300 dsv4 sglang image to yhyang201/sglang-b300:v3
cquil11 Apr 24, 2026
08edf26
update b300
cquil11 Apr 24, 2026
a699ca0
feat(dsv4-fp4-b300-sglang): pick recipe by CONC; split search-space
cquil11 Apr 24, 2026
d35696c
update b300
cquil11 Apr 24, 2026
c3b562c
feat(dsv4-fp4-b300-sglang): low-latency recipe at every CONC (fallback)
cquil11 Apr 24, 2026
410df74
fix: align perf-changelog and config comments with low-latency fallback
github-actions[bot] Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1796,6 +1796,34 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
dsv4-fp4-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-b300
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b300
precision: fp4
framework: sglang
multinode: false
# TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
# while the DeepEP FP8 weight-postprocess path is broken for this
# checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
# integers. raised from sglang.srt.layers.quantization.fp8
# .process_weights_after_loading_block_quant). Full concurrency sweep
# retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
# once sglang can load the checkpoint under --moe-a2a-backend deepep.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }

qwen3.5-bf16-b200-sglang:
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
model: Qwen/Qwen3.5-397B-A17B
Expand Down
103 changes: 103 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b300.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
#!/usr/bin/env bash
# Benchmark DeepSeek-V4-Pro FP4 with SGLang on a single B300 node.
#
# Required env vars (validated by check_env_vars from benchmark_lib.sh):
#   MODEL, TP, CONC, ISL, OSL, RANDOM_RANGE_RATIO, RESULT_FILENAME
# Optional: PORT (default 8888), EVAL_ONLY, RUN_EVAL, SLURM_JOB_ID.
#
# Outputs: server log at $PWD/server.log, GPU metrics at $PWD/gpu_metrics.csv,
# and benchmark results at $PWD/$RESULT_FILENAME.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# ${VAR:-} guards keep this safe even if the sourced lib enables `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
# `hf download`. Only fetch when MODEL looks like a HF repo ID.
if [[ "$MODEL" != /* ]]; then
  hf download "$MODEL"
fi

nvidia-smi

export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0

# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half
# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
unset CUDA_VISIBLE_DEVICES

# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for these images so the
# editable install stays visible. Paths in this script are $PWD-relative for
# that reason. Drop the runner conditional once lmsys moves sglang back out of
# /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# Build optional server args as an array (not a word-split string) so empty
# and multi-token cases expand correctly under quoting (SC2086).
EVAL_CONTEXT_ARGS=()
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS=(--context-length "$EVAL_MAX_MODEL_LEN")
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
# Restore the CONC-based low-latency / balanced / max-throughput dispatch
# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
# --moe-a2a-backend deepep.
RECIPE=low-latency
RECIPE_FLAGS=(
  --moe-runner-backend flashinfer_mxfp4
  --chunked-prefill-size 4096
  --disable-flashinfer-autotune
  --mem-fraction-static 0.82
)
echo "Recipe: $RECIPE (CONC=$CONC)"

set -x
# PYTHONNOUSERSITE=1 keeps any user-site packages from shadowing the image's
# editable sglang install.
PYTHONNOUSERSITE=1 sglang serve \
  --model-path "$MODEL" \
  --host 0.0.0.0 \
  --port "$PORT" \
  --trust-remote-code \
  --tp "$TP" \
  --disable-radix-cache \
  "${RECIPE_FLAGS[@]}" "${EVAL_CONTEXT_ARGS[@]}" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts $((CONC * 10)) \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir "$PWD/"

if [[ "${RUN_EVAL:-}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

# NOTE(review): the server started above is not explicitly killed here —
# presumably the Slurm job teardown (or benchmark_lib.sh) reaps it; confirm.
stop_gpu_monitor
set +x
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1745,3 +1745,14 @@
- "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading"
- "Configs: 1k1k conc 4-64, 8k1k conc 4-64"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130

- config-keys:
- dsv4-fp4-b300-sglang
description:
- "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark (low-latency fallback)"
- "Image: lmsysorg/sglang:deepseek-v4-b300"
- "Model: deepseek-ai/DeepSeek-V4-Pro"
- "Low-latency only (TP=8, EP=1, no DP-attn, no DeepEP) — DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300"
- "Prefix caching disabled, no speculative decoding"
- "Configs: 1k1k conc 4-1024, 8k1k conc 4-512"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143
5 changes: 2 additions & 3 deletions runners/launch_b200-dgxc-slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -249,8 +249,7 @@ EOF

else

HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
Expand All @@ -276,7 +275,7 @@ else

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-workdir=/workspace/ \
--no-container-entrypoint --export=ALL,PORT=8888 \
Expand Down
48 changes: 38 additions & 10 deletions runners/launch_b300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -248,29 +248,57 @@ find . -name '.nfs*' -delete 2>/dev/null || true

else

HF_HUB_CACHE_MOUNT="/scratch/models"
# Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster,
# so point MODEL at the local copy. Other models fall through and use `hf download`
# against the mounted cache from their benchmark script.
# Pre-staged models on the B300 cluster live under /data/models. Point MODEL
# at the local copy so the benchmark skips `hf download` and reads from the
# mounted dir. Other models fall through and use `hf download` from their
# benchmark script.
HF_HUB_CACHE_MOUNT="/data/models"
if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
export MODEL="/scratch/models/${MODEL#*/}"
export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then
export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
fi
SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell
# and its B300-recompiled forks like yhyang201/sglang-b300) install sglang
# editable at /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang),
# so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install
# and breaks `import sglang`. Mount these images at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *deepseek-v4-b300* || "$IMAGE" == *sglang-b300* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

# Import the squash file on the head node (outside any srun) under flock.
# Parallel GH jobs target the same shared squash path; flock serializes
# imports so only one job pulls and writes the file while the rest wait.
(
exec 9>"$LOCK_FILE"
flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; }
if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then
echo "Squash file already exists and is valid, skipping import"
else
rm -f "$SQUASH_FILE"
enroot import -o "$SQUASH_FILE" "docker://$IMAGE"
fi
)

# Pin to one of the known-good B300 nodes; others have hardware/network
# issues that cause benchmarks to hang or fail to start.
salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)

srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--no-container-mount-home \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888 \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh

Expand Down
Loading