diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index 1d467308f..e24408017 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -1832,9 +1832,10 @@ dsr1-fp8-b300-sglang:
         - { tp: 8, ep: 1, conc-start: 4, conc-end: 4 }
         - { tp: 4, ep: 1, conc-start: 4, conc-end: 32 }
 
-# NOTE: Low-latency fallback (TP=8, EP=1, no DP-attn, no DeepEP) while
-# the DeepEP FP8 weight-postprocess path is broken for DeepSeek-V4-Pro
-# on B300. Re-introduce balanced/max-throughput rows once fixed upstream.
+# NOTE: https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# lists B200 (not B300) as the Blackwell target; we reuse the B200 Pro FP4
+# recipes on B300 until a B300-specific recipe ships. Prefix caching is
+# disabled. Parallelisms mirror dsv4-fp4-b200-sglang.
 dsv4-fp4-b300-sglang:
   image: lmsysorg/sglang:deepseek-v4-b300
   model: deepseek-ai/DeepSeek-V4-Pro
@@ -1843,22 +1844,63 @@ dsv4-fp4-b300-sglang:
   precision: fp4
   framework: sglang
   multinode: false
-  # TODO(Cam): low-latency recipe only (TP-only, no DP-attn, no DeepEP)
-  # while the DeepEP FP8 weight-postprocess path is broken for this
-  # checkpoint on B300 (RuntimeError: Recipe must be a list/tuple of 3
-  # integers. raised from sglang.srt.layers.quantization.fp8
-  # .process_weights_after_loading_block_quant). Full concurrency sweep
-  # retained; revert to the recipe-per-CONC split on chore/dsv4-sgl-b300
-  # once sglang can load the checkpoint under --moe-a2a-backend deepep.
+  # Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+  # are selected inside benchmarks/single_node/dsv4_fp4_b300.sh by CONC:
+  #   low-latency    (CONC <= 32):       TP-only
+  #   balanced       (32 < CONC <= 128): + DP-attn + DeepEP (max-running-requests 128)
+  #   max-throughput (CONC > 128):       + DP-attn + DeepEP (max-running-requests 256)
+  # Split so result filenames (ep=, dpa=) accurately reflect the recipe.
+  # ep is implicit in sglang: --moe-a2a-backend deepep forces ep_size=tp_size,
+  # while low-latency leaves ep_size at the default of 1.
+  seq-len-configs:
+    - isl: 1024
+      osl: 1024
+      search-space:
+        # low-latency
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+        # balanced
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+        # max-throughput
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 1024 }
+    - isl: 8192
+      osl: 1024
+      search-space:
+        # low-latency
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32 }
+        # balanced
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128 }
+        # max-throughput
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 256, conc-end: 512 }
+
+dsv4-fp4-b300-sglang-mtp:
+  image: lmsysorg/sglang:deepseek-v4-b300
+  model: deepseek-ai/DeepSeek-V4-Pro
+  model-prefix: dsv4
+  runner: b300
+  precision: fp4
+  framework: sglang
+  multinode: false
+  # Mirrors dsv4-fp4-b300-sglang's low-latency and balanced rows with EAGLE
+  # MTP enabled per https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4:
+  #   low-latency (CONC <= 32):       EAGLE 3 steps / 4 draft tokens
+  #   balanced    (32 < CONC <= 128): EAGLE 1 step / 2 draft tokens
+  # Max-throughput is intentionally omitted: the cookbook recommends MTP off
+  # at saturation because the verify step costs more than it saves.
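+  # For reference, the balanced rows below map to roughly these server flags
+  # (abridged; benchmarks/single_node/dsv4_fp4_b300_mtp.sh is authoritative):
+  #   --tp 8 --dp-size 8 --enable-dp-attention --moe-a2a-backend deepep \
+  #   --speculative-algo EAGLE --speculative-num-steps 1 --speculative-num-draft-tokens 2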
   seq-len-configs:
     - isl: 1024
       osl: 1024
       search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 1024 }
+        # low-latency
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        # balanced
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
     - isl: 8192
       osl: 1024
       search-space:
-        - { tp: 8, ep: 1, conc-start: 4, conc-end: 512 }
+        # low-latency
+        - { tp: 8, ep: 1, conc-start: 4, conc-end: 32, spec-decoding: mtp }
+        # balanced
+        - { tp: 8, ep: 8, dp-attn: true, conc-start: 64, conc-end: 128, spec-decoding: mtp }
 
 qwen3.5-bf16-b200-sglang:
   image: lmsysorg/sglang:nightly-dev-20260216-d3bae71e
diff --git a/benchmarks/single_node/dsv4_fp4_b300.sh b/benchmarks/single_node/dsv4_fp4_b300.sh
new file mode 100755
index 000000000..faa946174
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300.sh
@@ -0,0 +1,129 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
+# `hf download`. Only fetch when MODEL looks like a HF repo ID.
+if [[ "$MODEL" != /* ]]; then
+    hf download "$MODEL"
+fi
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+
+# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
+# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks
+# half of the 8 GPUs Slurm allocates us. Clear it so TP=8 can bind to all ranks.
+unset CUDA_VISIBLE_DEVICES
+
+# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
+# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for these images so the
+# editable install stays visible. Paths in this script are $PWD-relative for
+# that reason. Drop the runner conditional once lmsys moves sglang back out of
+# /workspace.
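+
+# Example standalone invocation (illustrative only; the concrete values are
+# hypothetical and in CI are injected by the runner from the YAML search-space):
+#
+#   MODEL=deepseek-ai/DeepSeek-V4-Pro TP=8 CONC=16 ISL=1024 OSL=1024 \
+#   RANDOM_RANGE_RATIO=0.5 RESULT_FILENAME=dsv4_fp4_b300_example.json \
+#   benchmarks/single_node/dsv4_fp4_b300.sh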
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+# Three recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# (spec-decoding / MTP and prefix-caching flags dropped for the baseline):
+#   - low-latency    (CONC <= 32):       TP-only, chunked prefill, autotune disabled
+#   - balanced       (32 < CONC <= 128): + DP-attn, max-running-requests=128
+#   - max-throughput (CONC > 128):       + DP-attn, max-running-requests=256
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 4096
+        --disable-flashinfer-autotune
+        --mem-fraction-static 0.82
+    )
+elif [[ $CONC -le 128 ]]; then
+    RECIPE=balanced
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 128
+    )
+else
+    RECIPE=max-throughput
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 256
+    )
+fi
+echo "Recipe: $RECIPE (CONC=$CONC)"
+
+set -x
+# EVAL_CONTEXT_ARGS is intentionally unquoted: it expands to zero or two words.
+PYTHONNOUSERSITE=1 sglang serve \
+    --model-path "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --trust-remote-code \
+    --tp "$TP" \
+    --disable-radix-cache \
+    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $((CONC * 10)) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/"
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/benchmarks/single_node/dsv4_fp4_b300_mtp.sh b/benchmarks/single_node/dsv4_fp4_b300_mtp.sh
new file mode 100755
index 000000000..2e383ead2
--- /dev/null
+++ b/benchmarks/single_node/dsv4_fp4_b300_mtp.sh
@@ -0,0 +1,130 @@
+#!/usr/bin/env bash
+
+source "$(dirname "$0")/../benchmark_lib.sh"
+
+check_env_vars \
+    MODEL \
+    TP \
+    CONC \
+    ISL \
+    OSL \
+    RANDOM_RANGE_RATIO \
+    RESULT_FILENAME
+
+if [[ -n "$SLURM_JOB_ID" ]]; then
+    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
+fi
+
+# The B300 runner overrides MODEL to a pre-staged /data/models path, so skip
+# `hf download`. Only fetch when MODEL looks like a HF repo ID.
+if [[ "$MODEL" != /* ]]; then
+    hf download "$MODEL"
+fi
+
+nvidia-smi
+
+export SGLANG_JIT_DEEPGEMM_PRECOMPILE=0
+# Cookbook note: "MTP currently requires SGLANG_ENABLE_SPEC_V2=1."
+export SGLANG_ENABLE_SPEC_V2=1
+
+# The deepseek-v4 sglang images (lmsysorg/sglang:deepseek-v4-blackwell and its
+# B300 forks) bake CUDA_VISIBLE_DEVICES=4,5,6,7 into their ENV, which masks
+# half of the 8 GPUs Slurm allocates us.
+# Clear it so TP=8 can bind to all ranks.
+unset CUDA_VISIBLE_DEVICES
+
+# TODO(Cam): the deepseek-v4 sglang images install sglang editable at
+# /workspace/sglang/python; prior sglang tags used /sgl-workspace/sglang.
+# The runner mounts our repo at a non-/workspace path for these images so the
+# editable install stays visible. Paths in this script are $PWD-relative for
+# that reason. Drop the runner conditional once lmsys moves sglang back out of
+# /workspace.
+
+SERVER_LOG="$PWD/server.log"
+PORT=${PORT:-8888}
+
+echo "TP: $TP, CONC: $CONC, ISL: $ISL, OSL: $OSL"
+
+EVAL_CONTEXT_ARGS=""
+if [ "${EVAL_ONLY}" = "true" ]; then
+    setup_eval_context
+    EVAL_CONTEXT_ARGS="--context-length $EVAL_MAX_MODEL_LEN"
+fi
+
+start_gpu_monitor --output "$PWD/gpu_metrics.csv"
+
+# Two recipes from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4
+# with EAGLE / MTP enabled per the cookbook (prefix caching dropped):
+#   - low-latency (CONC <= 32):       TP-only + EAGLE 3 steps / 4 draft tokens
+#   - balanced    (32 < CONC <= 128): + DP-attn + EAGLE 1 step / 2 draft tokens
+# Max-throughput is intentionally not handled here: the cookbook recommends
+# MTP off at saturation because the verify step costs more than it saves.
+# dsv4-fp4-b300-sglang-mtp's search-space caps CONC at 128 to match.
+DEEPEP_CONFIG='{"normal_dispatch":{"num_sms":96},"normal_combine":{"num_sms":96}}'
+
+if [[ $CONC -le 32 ]]; then
+    RECIPE=low-latency
+    RECIPE_FLAGS=(
+        --moe-runner-backend flashinfer_mxfp4
+        --chunked-prefill-size 4096
+        --disable-flashinfer-autotune
+        --mem-fraction-static 0.82
+        --speculative-algo EAGLE
+        --speculative-num-steps 3
+        --speculative-eagle-topk 1
+        --speculative-num-draft-tokens 4
+    )
+else
+    RECIPE=balanced
+    export SGLANG_DEEPEP_NUM_MAX_DISPATCH_TOKENS_PER_RANK=256
+    RECIPE_FLAGS=(
+        --dp-size "$TP"
+        --enable-dp-attention
+        --moe-a2a-backend deepep
+        --deepep-config "$DEEPEP_CONFIG"
+        --mem-fraction-static 0.82
+        --cuda-graph-max-bs 64
+        --max-running-requests 128
+        --speculative-algo EAGLE
+        --speculative-num-steps 1
+        --speculative-eagle-topk 1
+        --speculative-num-draft-tokens 2
+    )
+fi
+echo "Recipe: $RECIPE (CONC=$CONC)"
+
+set -x
+# EVAL_CONTEXT_ARGS is intentionally unquoted: it expands to zero or two words.
+PYTHONNOUSERSITE=1 sglang serve \
+    --model-path "$MODEL" \
+    --host 0.0.0.0 \
+    --port "$PORT" \
+    --trust-remote-code \
+    --tp "$TP" \
+    --disable-radix-cache \
+    "${RECIPE_FLAGS[@]}" $EVAL_CONTEXT_ARGS > "$SERVER_LOG" 2>&1 &
+
+SERVER_PID=$!
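+
+# Rough intuition for the two MTP settings above (our reading of the cookbook,
+# not prescribed by it): each decode iteration runs num-steps draft passes plus
+# one target pass that verifies num-draft-tokens positions, so the tokens
+# accepted per verify pass must outweigh the extra draft and verification
+# compute. That margin is comfortable at low concurrency (3 steps / 4 drafts),
+# thinner on balanced (1 step / 2 drafts), and gone at saturation.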
+
+wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
+
+pip install -q datasets pandas
+
+run_benchmark_serving \
+    --model "$MODEL" \
+    --port "$PORT" \
+    --backend vllm \
+    --input-len "$ISL" \
+    --output-len "$OSL" \
+    --random-range-ratio "$RANDOM_RANGE_RATIO" \
+    --num-prompts $((CONC * 10)) \
+    --max-concurrency "$CONC" \
+    --result-filename "$RESULT_FILENAME" \
+    --result-dir "$PWD/" \
+    --use-chat-template
+
+if [ "${RUN_EVAL}" = "true" ]; then
+    run_eval --framework lm-eval --port "$PORT"
+    append_lm_eval_summary
+fi
+
+stop_gpu_monitor
+set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index a6c811748..64ff305f8 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1812,3 +1812,26 @@
     - "Topologies: low-conc 1p1d-dep8-tep8 (4 nodes, mirrored from NVIDIA srt-slurm PR #71 with offload kept and numa-bind dropped); mid 1p1d-dep8-dep16 (6 nodes) and high 3p1d-dep8-dep16 (10 nodes) hand-rolled, structurally derived from the kimi-k2.5 1k/1k pattern"
     - "Recipes stored under benchmarks/multi_node/srt-slurm-recipes/ and overlaid onto the upstream srt-slurm checkout at runtime"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1129
+
+- config-keys:
+    - dsv4-fp4-b300-sglang
+  description:
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang benchmark"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro (FP4 MoE experts + FP8 attention/dense)"
+    - "Reuses the B200 Pro FP4 recipes (low-latency / balanced / max-throughput, selected by CONC) from https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4 on B300 until a B300-specific recipe ships"
+    - "DP-attn (dp=8) + DeepEP on the balanced and max-throughput rows, prefix caching disabled, no speculative decoding"
+    - "Parallelisms (TP=8 throughout; EP=8 + dp-attn on balanced/max-throughput) and concurrency ranges (4-1024 for 1k1k, 4-512 for 8k1k) mirror dsv4-fp4-b200-sglang"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1132
+
+- config-keys:
+    - dsv4-fp4-b300-sglang-mtp
+  description:
+    - "Add DeepSeek-V4-Pro FP4 B300 SGLang MTP benchmark"
+    - "Image: lmsysorg/sglang:deepseek-v4-b300"
+    - "Model: deepseek-ai/DeepSeek-V4-Pro (FP4 MoE experts + FP8 attention/dense)"
+    - "Mirrors dsv4-fp4-b300-sglang's low-latency and balanced rows with EAGLE MTP enabled per https://docs.sglang.io/cookbook/autoregressive/DeepSeek/DeepSeek-V4"
+    - "EAGLE 3 steps / 4 draft tokens on low-latency, 1 step / 2 draft tokens on balanced"
+    - "Max-throughput intentionally omitted: the cookbook recommends MTP off at saturation because the verify step costs more than it saves"
+    - "SGLANG_ENABLE_SPEC_V2=1 required for MTP; --use-chat-template passed to bench_serving"
+  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1151