Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions .github/configs/nvidia-master.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1796,6 +1796,30 @@ dsr1-fp8-b300-sglang:
- { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
- { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }

# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists H200 (not B300) as the FP8 target. Only the Flash-FP8 checkpoint
# is live -- Pro-FP8 is still pending upload in the cookbook. This config
# runs the H200 Flash Max-Throughput recipe (DP + DeepEP, no MTP) on B300
# using the Blackwell image with SGLANG_DSV4_FP4_EXPERTS=0 to swap the
# experts to FP8. Prefix caching is disabled.
dsv4-fp8-b300-sglang:
image: lmsysorg/sglang:deepseek-v4-blackwell
model: sgl-project/DeepSeek-V4-Flash-FP8
model-prefix: dsv4
runner: b300
precision: fp8
framework: sglang
multinode: false
seq-len-configs:
- isl: 1024
osl: 1024
search-space:
- { tp: 4, ep: 4, dp-attn: true, conc-start: 4, conc-end: 1024 }
- isl: 8192
osl: 1024
search-space:
- { tp: 4, ep: 4, dp-attn: true, conc-start: 4, conc-end: 512 }

qwen3.5-bf16-b200-sglang:
image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
model: Qwen/Qwen3.5-397B-A17B
Expand Down
87 changes: 87 additions & 0 deletions benchmarks/single_node/dsv4_fp8_b300.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
#!/usr/bin/env bash

# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
# lists H200 (not B300) as the FP8 target, and only the Flash-FP8
# checkpoint is live (Pro-FP8 is still pending upload). This script
# reuses the H200 Flash Max-Throughput recipe (DP + DeepEP, no MTP) on
# B300 using the Blackwell image with SGLANG_DSV4_FP4_EXPERTS=0 to swap
# the experts to FP8. Prefix caching is disabled.
#
# Required env vars: MODEL TP CONC ISL OSL RANDOM_RANGE_RATIO
#                    RESULT_FILENAME EP_SIZE DP_ATTENTION
# Optional env vars: PORT (default 8888), EVAL_ONLY, RUN_EVAL, SLURM_JOB_ID

# Provides check_env_vars, setup_eval_context, start/stop_gpu_monitor,
# wait_for_server_ready, run_benchmark_serving, run_eval,
# append_lm_eval_summary.
source "$(dirname "$0")/../benchmark_lib.sh"

check_env_vars \
    MODEL \
    TP \
    CONC \
    ISL \
    OSL \
    RANDOM_RANGE_RATIO \
    RESULT_FILENAME \
    EP_SIZE \
    DP_ATTENTION

# ${VAR:-} guards keep this safe outside SLURM and if the sourced lib
# enables `set -u`.
if [[ -n "${SLURM_JOB_ID:-}" ]]; then
    echo "JOB $SLURM_JOB_ID running on ${SLURMD_NODENAME:-unknown}"
fi

# Pre-fetch weights into the HF cache before the server starts.
hf download "$MODEL"

nvidia-smi

# Force FP8 experts on the Blackwell image (the image defaults to FP4
# experts); cap DeepEP dispatch tokens per rank; skip DeepGEMM JIT
# precompilation at startup.
export SGLANG_DSV4_FP4_EXPERTS=0
export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0

SERVER_LOG=/workspace/server.log
PORT=${PORT:-8888}

echo "TP: $TP, EP_SIZE: $EP_SIZE, DP_ATTENTION: $DP_ATTENTION, CONC: $CONC, ISL: $ISL, OSL: $OSL"

# Build optional flag groups as arrays so each flag stays its own argv
# word without relying on unquoted word-splitting (ShellCheck SC2086).
DP_ATTN_ARGS=()
if [[ "$DP_ATTENTION" == "true" ]]; then
    # DP attention: one attention data-parallel replica per TP rank.
    DP_ATTN_ARGS=(--data-parallel-size "$TP" --enable-dp-attention)
fi

EVAL_CONTEXT_ARGS=()
if [[ "${EVAL_ONLY:-}" == "true" ]]; then
    setup_eval_context
    EVAL_CONTEXT_ARGS=(--context-length "$EVAL_MAX_MODEL_LEN")
fi

start_gpu_monitor

set -x
# H200 Flash Max-Throughput recipe: DeepEP A2A backend, radix (prefix)
# cache disabled, no speculative decoding.
PYTHONNOUSERSITE=1 python3 -m sglang.launch_server --model-path="$MODEL" --host=0.0.0.0 --port="$PORT" \
    --trust-remote-code \
    --tensor-parallel-size="$TP" --ep-size "$EP_SIZE" "${DP_ATTN_ARGS[@]}" \
    --moe-a2a-backend deepep \
    --deepep-config '{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}' \
    --cuda-graph-max-bs 128 \
    --max-running-requests 256 \
    --disable-radix-cache "${EVAL_CONTEXT_ARGS[@]}" > "$SERVER_LOG" 2>&1 &

SERVER_PID=$!

wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"

pip install -q datasets pandas

run_benchmark_serving \
    --model "$MODEL" \
    --port "$PORT" \
    --backend vllm \
    --input-len "$ISL" \
    --output-len "$OSL" \
    --random-range-ratio "$RANDOM_RANGE_RATIO" \
    --num-prompts "$((CONC * 10))" \
    --max-concurrency "$CONC" \
    --result-filename "$RESULT_FILENAME" \
    --result-dir /workspace/

if [[ "${RUN_EVAL:-}" == "true" ]]; then
    run_eval --framework lm-eval --port "$PORT"
    append_lm_eval_summary
fi

stop_gpu_monitor
set +x
11 changes: 11 additions & 0 deletions perf-changelog.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1733,3 +1733,14 @@
- "TP=2 and TP=4, concurrency 4-256 for 1k1k and 8k1k sequence lengths"
- "Add --max-num-seqs and --gpu-memory-utilization 0.9 to server launch"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1043

- config-keys:
- dsv4-fp8-b300-sglang
description:
- "Add DeepSeek-V4-Flash FP8 B300 SGLang benchmark"
- "Image: lmsysorg/sglang:deepseek-v4-blackwell (with SGLANG_DSV4_FP4_EXPERTS=0 to swap experts to FP8)"
- "Model: sgl-project/DeepSeek-V4-Flash-FP8"
- "Reuses the H200 Flash Max-Throughput recipe from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300; Pro-FP8 checkpoint is still pending upload in the cookbook"
- "DP + DeepEP, prefix caching disabled, no speculative decoding"
- "TP=4/EP=4/dp-attn=true, concurrency 4-1024 for 1k1k and 4-512 for 8k1k"
pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/XXXX
25 changes: 15 additions & 10 deletions runners/launch_b300-nv.sh
Original file line number Diff line number Diff line change
Expand Up @@ -248,27 +248,32 @@ find . -name '.nfs*' -delete 2>/dev/null || true

else

HF_HUB_CACHE_MOUNT="/scratch/models"
# Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster,
# so point MODEL at the local copy. Other models fall through and use `hf download`
# against the mounted cache from their benchmark script.
if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
export MODEL="/scratch/models/${MODEL#*/}"
fi
SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
HF_HUB_CACHE_MOUNT="/data/home/sa-shared/gharunners/hf-hub-cache"
SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
LOCK_FILE="${SQUASH_FILE}.lock"

# Pin to one of the known-good B300 nodes; others have hardware/network
# issues that cause benchmarks to hang or fail to start.
salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)

srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
# Use flock to serialize concurrent imports to the same squash file
srun --jobid=$JOB_ID bash -c "
exec 9>\"$LOCK_FILE\"
flock -w 600 9 || { echo 'Failed to acquire lock for $SQUASH_FILE'; exit 1; }
if unsquashfs -l \"$SQUASH_FILE\" > /dev/null 2>&1; then
echo 'Squash file already exists and is valid, skipping import'
else
rm -f \"$SQUASH_FILE\"
enroot import -o \"$SQUASH_FILE\" docker://$IMAGE
fi
"

srun --jobid=$JOB_ID \
--container-image=$SQUASH_FILE \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
--container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
--no-container-mount-home \
--container-workdir=/workspace/ \
--no-container-entrypoint --export=ALL,PORT=8888 \
Expand Down
Loading