Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
f9a0ed9
Add dsv4-fp4-b200-sglang single-node config
cquil11 Apr 24, 2026
44a1c1f
Switch dsv4-fp4-b200-sglang to Pro model, match vllm parallelism
cquil11 Apr 24, 2026
c21ee5c
Match DSV4 Pro SGLang recipe literally; port HF cache path
cquil11 Apr 24, 2026
0399773
fix: use 'sglang serve' CLI, not python -m sglang.launch_server
cquil11 Apr 24, 2026
4a3e3e9
fix: mount repo at /ix for deepseek-v4-blackwell image
cquil11 Apr 24, 2026
ffd0874
fix: reinstall sglang from PyPI to work around masked editable install
cquil11 Apr 24, 2026
fef260f
fix: uninstall editable sglang before reinstalling from PyPI
cquil11 Apr 24, 2026
da148a1
fix: mount repo at /ix for deepseek-v4-blackwell; drop pip workaround
cquil11 Apr 24, 2026
95eb527
fix: unset baked-in CUDA_VISIBLE_DEVICES for deepseek-v4-blackwell image
cquil11 Apr 24, 2026
9a3457a
fix: apply same /ix mount fix to launch_b200-nb.sh
cquil11 Apr 24, 2026
9779d14
Drop --container-name arg from launch_b200-nb.sh
cquil11 Apr 24, 2026
fe012a7
change runner
cquil11 Apr 24, 2026
151a62f
update recipe
cquil11 Apr 24, 2026
d96a2b0
Fix launch_b200-cw.sh and add b200-cw to runners pool
cquil11 Apr 24, 2026
ffd8e47
update recipe
cquil11 Apr 24, 2026
3a354ef
update model storage to nvme
cquil11 Apr 24, 2026
a425131
fix(launch_b200-cw): skip realpath on worker-local squash; drop stale…
cquil11 Apr 24, 2026
103a202
feat(dsv4_fp4_b200): pick recipe (low-latency/balanced/max-throughput…
cquil11 Apr 24, 2026
4a96602
update b200
cquil11 Apr 24, 2026
43be495
feat(dsv4-fp4-b200-sglang): split search-space per sglang recipe
cquil11 Apr 24, 2026
a2241a5
Merge branch 'main' into chore/dsv4-sgl-b200
cquil11 Apr 24, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1669,6 +1669,42 @@ dsr1-fp4-b200-sglang:
- { tp: 4, ep: 4, conc-start: 4, conc-end: 128 }
- { tp: 8, ep: 8, conc-start: 4, conc-end: 16 }

dsv4-fp4-b200-sglang:
image: lmsysorg/sglang:deepseek-v4-blackwell
model: deepseek-ai/DeepSeek-V4-Pro
model-prefix: dsv4
runner: b200-dsv4
precision: fp4
framework: sglang
multinode: false
# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# are selected inside benchmarks/single_node/dsv4_fp4_b200.sh by CONC:
# low-latency (CONC <= 32): TP-only
# balanced (32 < CONC <= 128): + DP-attn
# max-throughput (CONC > 128): + DP-attn
# Split so result filenames (ep=, dpa=) accurately reflect the recipe.
# ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
# while low-latency leaves ep_size at the default of 1.
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
# low-latency
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
# low-latency
- { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
# balanced
- { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
# max-throughput
- { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }

# NOTE: At the time of submission, https://cookbook.sglang.io/autoregressive/DeepSeek/DeepSeek-R1
# does not have a B300-specific recipe, so this config reuses the existing DSR1 FP4
# B200 SGLang recipe as-is until B300-specific tuning is available.
Expand Down
2 changes: 2 additions & 0 deletions .github/configs/runners.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,8 @@ h200-multinode:
- 'h200-dgxc-slurm_12'
- 'h200-dgxc-slurm_13'
b200:
- 'b200-cw_00'
- 'b200-cw_01'
- 'b200-nb_0'
- 'b200-nb_1'
- 'b200-dgxc-slurm_0'
Expand Down
119 changes: 119 additions & 0 deletions benchmarks/single_node/dsv4_fp4_b200.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#!/usr/bin/env bash
# Single-node DeepSeek-V4 FP4 B200 benchmark on SGLang.
#
# Required env vars (validated below): MODEL, TP, CONC, ISL, OSL,
# RANDOM_RANGE_RATIO, RESULT_FILENAME.
# Optional: PORT (default 8888), EVAL_ONLY, RUN_EVAL.
#
# Selects one of three SGLang cookbook recipes based on CONC, launches
# `sglang serve`, waits for readiness, runs the serving benchmark, and
# optionally runs lm-eval.

source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
  MODEL \
  TP \
  CONC \
  ISL \
  OSL \
  RANDOM_RANGE_RATIO \
  RESULT_FILENAME

# ${VAR:-} guards keep these checks safe even if run under `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
  echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

hf download "$MODEL"

nvidia-smi

export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0

# TODO(Cam): the lmsysorg/sglang:deepseek-v4-blackwell image installs sglang
# editable at /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
# The runner mounts our repo at a non-/workspace path for this image so the editable
# install stays visible. Paths in this script are $PWD-relative for that reason.
# Drop the runner conditional once lmsys moves sglang back out of /workspace.

SERVER_LOG="$PWD/server.log"
PORT=${PORT:-8888}

echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# Use a bash array (not a space-joined string) so the optional flag pair
# survives quoting intact when spliced into the serve command line.
EVAL_CONTEXT_ARGS=()
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
  setup_eval_context
  EVAL_CONTEXT_ARGS=(--context-length "$EVAL_MAX_MODEL_LEN")
fi

start_gpu_monitor --output "$PWD/gpu_metrics.csv"

# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
# - low-latency (CONC <= 32): TP-only, chunked-prefill, disable autotune
# - balanced (32 < CONC <= 128): + DP-attn, max-running-requests=128
# - max-throughput (CONC > 128): + DP-attn, max-running-requests=256
DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'

if [[ $CONC -le 32 ]]; then
  RECIPE=low-latency
  RECIPE_FLAGS=(
    --moe-runner-backend flashinfer_mxfp4
    --chunked-prefill-size 4096
    --disable-flashinfer-autotune
    --mem-fraction-static 0.82
  )
elif [[ $CONC -le 128 ]]; then
  RECIPE=balanced
  export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
  RECIPE_FLAGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --deepep-config "$DEEPEP_CONFIG"
    --mem-fraction-static 0.82
    --cuda-graph-max-bs 64
    --max-running-requests 128
  )
else
  RECIPE=max-throughput
  export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
  RECIPE_FLAGS=(
    --dp-size "$TP"
    --enable-dp-attention
    --moe-a2a-backend deepep
    --deepep-config "$DEEPEP_CONFIG"
    --mem-fraction-static 0.82
    --cuda-graph-max-bs 64
    --max-running-requests 256
  )
fi
echo "Recipe: $RECIPE (CONC=$CONC)"

set -x
# All expansions quoted so a model path or port containing unusual characters
# cannot be word-split or glob-expanded.
PYTHONNOUSERSITE=1 sglang serve \
  --model-path "$MODEL" \
  --host 0.0.0.0 \
  --port "$PORT" \
  --trust-remote-code \
  --tp "$TP" \
  --disable-radix-cache \
  "${RECIPE_FLAGS[@]}" "${EVAL_CONTEXT_ARGS[@]}" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
  --model "$MODEL" \
  --port "$PORT" \
  --backend vllm \
  --input-len "$ISL" \
  --output-len "$OSL" \
  --random-range-ratio "$RANDOM_RANGE_RATIO" \
  --num-prompts $((CONC * 10)) \
  --max-concurrency "$CONC" \
  --result-filename "$RESULT_FILENAME" \
  --result-dir "$PWD/"

if [[ "${RUN_EVAL:-}" == "true" ]]; then
  run_eval --framework lm-eval --port "$PORT"
  append_lm_eval_summary
fi

stop_gpu_monitor
set +x
10 changes: 10 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
@@ -1,3 +1,13 @@
- config-keys:
- dsv4-fp4-b200-sglang
description:
- "Add DeepSeek-V4-Pro single-node B200 SGLang benchmark (TP8, EP8, dp-attention)"
- "Container: lmsysorg/sglang:deepseek-v4-blackwell"
- "Recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
Comment thread
cquil11 marked this conversation as resolved.
- "Parallelism and sweep conc ranges match the dsv4-fp4-b200-vllm config"
- "Prefix caching and speculative decoding disabled for baseline numbers"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1131

- config-keys:
- dsr1-fp8-h100-dynamo-trt
- dsr1-fp8-h100-dynamo-sglang
Expand Down
63 changes: 63 additions & 0 deletions runners/launch_b200-cw.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
#!/usr/bin/env bash
# Launch a single-node B200 benchmark on the CoreWeave Slurm cluster.
#
# Required env vars: EXP_NAME, FRAMEWORK, SPEC_DECODING, IMAGE, TP,
# RUNNER_NAME, GITHUB_WORKSPACE, HF_HUB_CACHE, MODEL, PRECISION.
#
# Allocates a Slurm job, materializes the container image (Docker image
# directly, or an enroot squash file cached on the worker's /tmp), then runs
# the per-model benchmark script inside the container.

export HF_HUB_CACHE_MOUNT="/tmp/gharunner/hf-hub-cache"
export PORT=8888

MODEL_CODE="${EXP_NAME%%_*}"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

PARTITION="b200"
SQUASH_FILE="/tmp/gharunner/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
# breaks `import sglang`. Mount this one image at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
  CONTAINER_MOUNT_DIR=/ix
else
  CONTAINER_MOUNT_DIR=/workspace
fi

set -x

JOB_ID=$(salloc --partition="$PARTITION" --gres="gpu:b200:$TP" --time=180 --no-shell --job-name="$RUNNER_NAME" 2>&1 | tee /dev/stderr | grep -oP 'Granted job allocation \K[0-9]+')

if [[ -z "$JOB_ID" ]]; then
  echo "ERROR: salloc failed to allocate a job" >&2
  exit 1
fi

# Release the allocation on every exit path (success, srun failure, signal),
# not just the happy path.
trap 'scancel "$JOB_ID"' EXIT

# Use Docker image directly for openai/gpt-oss-120b with trt, otherwise use squash file
if [[ "$MODEL" == "openai/gpt-oss-120b" && "$FRAMEWORK" == "trt" ]]; then
  CONTAINER_IMAGE=$IMAGE
else
  # Use flock to serialize concurrent imports to the same squash file.
  # Fail fast if the import fails — a missing/corrupt squash file would only
  # surface as a confusing error in the benchmark srun below.
  srun --jobid="$JOB_ID" --job-name="$RUNNER_NAME" bash -c "
exec 9>\"$LOCK_FILE\"
flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
echo 'Squash file already exists and is valid, skipping import'
else
rm -f \"$SQUASH_FILE\"
enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
fi
" || { echo "ERROR: squash import failed" >&2; exit 1; }
  # Squash file lives on the allocated worker node's /tmp, which is not
  # visible from the host, so realpath on the host would return empty.
  # Pass the path as-is; srun resolves it inside the job.
  CONTAINER_IMAGE=$SQUASH_FILE
fi

srun --jobid="$JOB_ID" \
  --container-image="$CONTAINER_IMAGE" \
  --container-mounts="$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE" \
  --container-mount-home \
  --container-workdir="$CONTAINER_MOUNT_DIR" \
  --no-container-entrypoint --export=ALL \
  bash "benchmarks/single_node/${MODEL_CODE}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh"
15 changes: 13 additions & 2 deletions runners/launch_b200-dgxc-slurm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,17 @@ else
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
LOCK_FILE="${SQUASH_FILE}.lock"

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
# breaks `import sglang`. Mount this one image at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)

Expand All @@ -275,9 +286,9 @@ else

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888 \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
fi
16 changes: 13 additions & 3 deletions runners/launch_b200-nb.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,24 @@ SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')

UCX_NET_DEVICES=eth0

# TODO(Cam): lmsysorg/sglang:deepseek-v4-blackwell installs sglang editable at
# /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang), so
# the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install and
# breaks `import sglang`. Mount this one image at /ix instead; drop the
# conditional once the image stops installing editable under /workspace.
if [[ "$IMAGE" == *deepseek-v4-blackwell* ]]; then
CONTAINER_MOUNT_DIR=/ix
else
CONTAINER_MOUNT_DIR=/workspace
fi

set -x
srun --partition=$PARTITION --gres=gpu:$TP --exclusive --job-name="$RUNNER_NAME" \
--container-image=$IMAGE \
--container-name=$(echo "$IMAGE" | sed 's/[\/:@#]/_/g')-${USER} \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-remap-root \
--container-writable \
--container-workdir=/workspace/ \
--container-workdir=$CONTAINER_MOUNT_DIR \
--no-container-entrypoint --export=ALL,PORT=8888,UCX_NET_DEVICES=$UCX_NET_DEVICES \
bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b200${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh
Loading