diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 96273444f..6783dee76 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1796,6 +1796,34 @@ dsr1-fp8-b300-sglang:
       - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
       - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
+# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
+# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
+# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
+dsv4-fp4-b300-sglang:
+  image: lmsysorg/sglang:deepseek-v4-b300
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
+  # while the DeepEP FP8 weight-postprocess path is broken for this
+  # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
+  # integers. raised from sglang.srt.layers.quantization.fp8
+  # .process_weights_after_loading_block_quant). Full concurrency sweep
+  # retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
+  # once sglang can load the checkpoint under --moe-a2a-backend deepep.
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
+
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
   model: Qwen/Qwen3.5-397B-A17B
diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
new file mode 100755
index 000000000..c9fb238a5
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+  MODEL \
+  TP \
+  CONC \
+  ISL \
+  OSL \
+  RANDOM_RANGE_RATIO \
+  RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+  echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
+# `hf download`. Only fetch when MODEL looks like a HF repo ID.
+if [[ "$MODEL" != /* ]]; then
+  hf download "$MODEL"
+fi
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
+# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks half
+# of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
+unset CUDA_VISIBLE_DEVICES
+
+# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
+# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for these images so the
+# editable install stays visible. Paths in this script are $PWD-relative for
+# that reason. Drop the runner conditional once lmsys moves sglang back out of
+# /workspace.
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=()
+if [ "${EVAL_ONLY}" = "true" ]; then
+  setup_eval_context
+  EVAL_CONTEXT_ARGS=(--context-length "$EVAL_MAX_MODEL_LEN")
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+# TODO(Cam): hardcoded to the low-latency recipe at every CONC until the
+# DeepEP FP8 weight-postprocess path is fixed for this checkpoint on B300
+# (RuntimeError: Recipe must be a list/tuple of 3 integers. raised from
+# sglang.srt.layers.quantization.fp8.process_weights_after_loading_block_quant).
+# Restore the CONC-based low-latency / balanced / max-throughput dispatch
+# on chore/dsv4-sgl-b300 once sglang can load the checkpoint under
+# --moe-a2a-backend deepep.
+RECIPE=low-latency
+RECIPE_FLAGS=(
+  --moe-runner-backend flashinfer_mxfp4
+  --chunked-prefill-size 4096
+  --disable-flashinfer-autotune
+  --mem-fraction-static 0.82
+)
+echo "Recipe: $RECIPE (CONC=$CONC)"
+
+set -x
+PYTHONNOUSERSITE=1 sglang serve \
+  --model-path "$MODEL" \
+  --host 0.0.0.0 \
+  --port "$PORT" \
+  --trust-remote-code \
+  --tp "$TP" \
+  --disable-radix-cache \
+  "${RECIPE_FLAGS[@]}" "${EVAL_CONTEXT_ARGS[@]}" > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+  --model "$MODEL" \
+  --port "$PORT" \
+  --backend vllm \
+  --input-len "$ISL" \
+  --output-len "$OSL" \
+  --random-range-ratio "$RANDOM_RANGE_RATIO" \
+  --num-prompts $((CONC * 10)) \
+  --max-concurrency "$CONC" \
+  --result-filename "$RESULT_FILENAME" \
+  --result-dir "$PWD/"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+  run_eval --framework lm-eval --port "$PORT"
+  append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 2b2e138c8..458994e1b 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1745,3 +1745,14 @@
     - "VLLM_ENGINE_READY_TIMEOUT_S=3600 to accommodate large weight loading"
     - "Configs: 1k1k conc 4-64, 8k1k conc 4-64"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1130
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark (low-latency fallback)"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro"
+    - "Low-latency only (TP=8, EP=1, no DP-attn, no DeepEP) — DeepEP FP8 weight-postprocess path is broken for this checkpoint on B300"
+    - "Prefix caching disabled, no speculative decoding"
+    - "Configs: 1k1k conc 4-1024, 8k1k conc 4-512"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1143
diff --git a/runners/launch_b200-dgxc-slurm.sh b/runners/launch_b200-dgxc-slurm.sh
index c0f25310b..b9d4d90cc 100644
--- a/runners/launch_b200-dgxc-slurm.sh
+++ b/runners/launch_b200-dgxc-slurm.sh
@@ -249,8 +249,7 @@
 EOF
 else
-  HF_HUB_CACHE_MOUNT="/scratch/fsw/models"
-  export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
+  HF_HUB_CACHE_MOUNT="/scratch/fsw/gharunners/hf-hub-cache"
 
   SQUASH_FILE="/home/sa-shared/containers/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
   FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
   SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
@@ -276,7 +275,7 @@
 
   srun --jobid=$JOB_ID \
     --container-image=$SQUASH_FILE \
-    --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+    --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE \
     --no-container-mount-home \
     --container-workdir=/workspace/ \
     --no-container-entrypoint --export=ALL,PORT=8888 \
diff --git a/runners/launch_b300-nv.sh b/runners/launch_b300-nv.sh
index b49391a3c..3daac0167 100644
--- a/runners/launch_b300-nv.sh
+++ b/runners/launch_b300-nv.sh
@@ -248,28 +248,58 @@
   find . -name '.nfs*' -delete 2>/dev/null || true
 else
-  HF_HUB_CACHE_MOUNT="/scratch/models"
-  # Qwen3.5-397B-A17B-FP8 is pre-staged under /scratch/models on the B300 cluster,
-  # so point MODEL at the local copy. Other models fall through and use `hf download`
-  # against the mounted cache from their benchmark script.
+  # Pre-staged models on the B300 cluster live under /data/models. Point MODEL
+  # at the local copy so the benchmark skips `hf download` and reads from the
+  # mounted dir. Other models fall through and use `hf download` from their
+  # benchmark script.
+  HF_HUB_CACHE_MOUNT="/data/models"
   if [[ "$MODEL" == "Qwen/Qwen3.5-397B-A17B-FP8" ]]; then
-    export MODEL="/scratch/models/${MODEL#*/}"
+    export MODEL="$HF_HUB_CACHE_MOUNT/${MODEL#*/}"
+  elif [[ "$MODEL_PREFIX" == "dsv4" ]]; then
+    export MODEL="$HF_HUB_CACHE_MOUNT/dsv4-pro"
   fi
 
-  SQUASH_FILE="/data/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
+  SQUASH_FILE="/data/home/sa-shared/gharunners/squash/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh"
   FRAMEWORK_SUFFIX=$([[ "$FRAMEWORK" == "trt" ]] && printf '_trt' || printf '')
   SPEC_SUFFIX=$([[ "$SPEC_DECODING" == "mtp" ]] && printf '_mtp' || printf '')
+  LOCK_FILE="${SQUASH_FILE}.lock"
+
+  # TODO(Cam): the deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell
+  # and its B300-recompiled forks like yhyang201/sglang-b300) install sglang
+  # editable at /workspace/sglang/python (prior sglang tags used /sgl-workspace/sglang),
+  # so the default $GITHUB_WORKSPACE:/workspace/ bind-mount masks the install
+  # and breaks `import sglang`. Mount these images at /ix instead; drop the
+  # conditional once the image stops installing editable under /workspace.
+  if [[ "$IMAGE" == *deepseek-v4-blackwell* || "$IMAGE" == *deepseek-v4-bw-ultra* || "$IMAGE" == *deepseek-v4-b300* || "$IMAGE" == *sglang-b300* ]]; then
+    CONTAINER_MOUNT_DIR=/ix
+  else
+    CONTAINER_MOUNT_DIR=/workspace
+  fi
+
+  # Import the squash file on the head node (outside any srun) under flock.
+  # Parallel GH jobs target the same shared squash path; flock serializes
+  # imports so only one job pulls and writes the file while the rest wait.
+  (
+    exec 9>"$LOCK_FILE"
+    flock -w 600 9 || { echo "Failed to acquire lock for $SQUASH_FILE" >&2; exit 1; }
+    if unsquashfs -l "$SQUASH_FILE" > /dev/null 2>&1; then
+      echo "Squash file already exists and is valid, skipping import"
+    else
+      rm -f "$SQUASH_FILE"
+      enroot import -o "$SQUASH_FILE" "docker://$IMAGE"
+    fi
+  # The `exit 1` above only terminates the subshell; propagate any lock or
+  # enroot-import failure here so we never srun a missing/corrupt squash file.
+  ) || exit 1
 
   # Pin to one of the known-good B300 nodes; others have hardware/network
   # issues that cause benchmarks to hang or fail to start.
   salloc --partition=$SLURM_PARTITION --account=$SLURM_ACCOUNT --nodelist=b300-[001-006,008-012,017-020] -N 1 --gres=gpu:$TP --exclusive --time=180 --no-shell --job-name="$RUNNER_NAME"
   JOB_ID=$(squeue --name="$RUNNER_NAME" -u "$USER" -h -o %A | head -n1)
 
-  srun --jobid=$JOB_ID bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE"
-
   srun --jobid=$JOB_ID \
     --container-image=$SQUASH_FILE \
-    --container-mounts=$GITHUB_WORKSPACE:/workspace/,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
+    --container-mounts=$GITHUB_WORKSPACE:$CONTAINER_MOUNT_DIR,$HF_HUB_CACHE_MOUNT:$HF_HUB_CACHE_MOUNT \
     --no-container-mount-home \
-    --container-workdir=/workspace/ \
+    --container-workdir=$CONTAINER_MOUNT_DIR \
     --no-container-entrypoint --export=ALL,PORT=8888 \
     bash benchmarks/single_node/${EXP_NAME%%_*}_${PRECISION}_b300${FRAMEWORK_SUFFIX}${SPEC_SUFFIX}.sh